Updated WebP with head change#I3da2063b

Fixes the RGBA4444 color mode with respect to fancy upsampling and the
4-bit clipping logic.

Change-Id: Ib6e58bcdb9de3713d5b874d7660c5734c9a3b104
diff --git a/ChangeLog b/ChangeLog
index e913771..06c8316 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -3,3 +3,4 @@
 - 6/11: Added encoder (version 0.1.2) as well
 - 7/11: Updated WebP with head change#Ia53f845b
 - 8/12: release version 0.2.0-rc1 (head change#Ia5475247)
+-     : Updated WebP with head change#I3da2063b
diff --git a/README.android b/README.android
index 4c866d6..787a8a9 100644
--- a/README.android
+++ b/README.android
@@ -24,6 +24,7 @@
   The fix is similar to jpeglib handling for JCS_RGB_565 & JCS_RGBA_8888
   color configs. Added the code under "ANDROID_WEBP_RGB" flag.
 - Sync-patch with libwebp ver 0.2.0-rc1 (head change#Ia5475247).
+- Updated WebP with head change#I3da2063b
 
 The Android.mk file creates WebP Decoder and Encoder static libraries which
 can be added to any application by Adding to LOCAL_STATIC_LIBRARIES
diff --git a/include/webp/format_constants.h b/include/webp/format_constants.h
index 3467fc8..7ce498f 100644
--- a/include/webp/format_constants.h
+++ b/include/webp/format_constants.h
@@ -79,9 +79,9 @@
 
 #define MAX_CANVAS_SIZE     (1 << 24)    // 24-bit max for VP8X width/height.
 #define MAX_IMAGE_AREA      (1ULL << 32) // 32-bit max for width x height.
-#define MAX_LOOP_COUNT      (1  << 16)   // maximum value for loop-count
-#define MAX_DURATION        (1 << 24)   // maximum duration
-#define MAX_POSITION_OFFSET (1 << 24)   // maximum frame/tile x/y offset
+#define MAX_LOOP_COUNT      (1 << 16)    // maximum value for loop-count
+#define MAX_DURATION        (1 << 24)    // maximum duration
+#define MAX_POSITION_OFFSET (1 << 24)    // maximum frame/tile x/y offset
 
 // Maximum chunk payload is such that adding the header and padding won't
 // overflow a uint32_t.
diff --git a/src/dec/io.c b/src/dec/io.c
index b90f6c5..c5746f7 100644
--- a/src/dec/io.c
+++ b/src/dec/io.c
@@ -184,38 +184,45 @@
   return 0;
 }
 
+static int GetAlphaSourceRow(const VP8Io* const io,
+                             const uint8_t** alpha, int* const num_rows) {
+  int start_y = io->mb_y;
+  *num_rows = io->mb_h;
+
+  // Compensate for the 1-line delay of the fancy upscaler.
+  // This is similar to EmitFancyRGB().
+  if (io->fancy_upsampling) {
+    if (start_y == 0) {
+      // We don't process the last row yet. It'll be done during the next call.
+      --*num_rows;
+    } else {
+      --start_y;
+      // Fortunately, *alpha data is persistent, so we can go back
+      // one row and finish alpha blending, now that the fancy upscaler
+      // completed the YUV->RGB interpolation.
+      *alpha -= io->width;
+    }
+    if (io->crop_top + io->mb_y + io->mb_h == io->crop_bottom) {
+      // If it's the very last call, we process all the remaining rows!
+      *num_rows = io->crop_bottom - io->crop_top - start_y;
+    }
+  }
+  return start_y;
+}
+
 static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
   const uint8_t* alpha = io->a;
   if (alpha != NULL) {
     const int mb_w = io->mb_w;
-    const int mb_h = io->mb_h;
     int i, j;
     const WEBP_CSP_MODE colorspace = p->output->colorspace;
     const int alpha_first =
         (colorspace == MODE_ARGB || colorspace == MODE_Argb);
     const WebPRGBABuffer* const buf = &p->output->u.RGBA;
-    int start_y = io->mb_y;
-    int num_rows = mb_h;
+    int num_rows;
+    const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
     uint32_t alpha_mask = 0xff;
 
-    // We compensate for the 1-line delay of fancy upscaler.
-    // This is similar to EmitFancyRGB().
-    if (io->fancy_upsampling) {
-      if (start_y == 0) {
-        // We don't process the last row yet. It'll be done during next call.
-        --num_rows;
-      } else {
-        --start_y;
-        // Fortunately, *alpha data is persistent, so we can go back
-        // one row and finish alpha blending, now that the fancy upscaler
-        // completed the YUV->RGB interpolation.
-        alpha -= io->width;
-      }
-      if (io->crop_top + io->mb_y + mb_h == io->crop_bottom) {
-        // If it's the very last call, we process all the remaing rows!
-        num_rows = io->crop_bottom - io->crop_top - start_y;
-      }
-    }
     {
       uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
       uint8_t* dst = base_rgba + (alpha_first ? 0 : 3);
@@ -242,24 +249,28 @@
   const uint8_t* alpha = io->a;
   if (alpha != NULL) {
     const int mb_w = io->mb_w;
-    const int mb_h = io->mb_h;
     int i, j;
     const WebPRGBABuffer* const buf = &p->output->u.RGBA;
-    uint8_t* const base_rgba = buf->rgba + io->mb_y * buf->stride;
-    uint8_t* alpha_dst = base_rgba + 1;
+    int num_rows;
+    const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
     uint32_t alpha_mask = 0x0f;
-    for (j = 0; j < mb_h; ++j) {
-      for (i = 0; i < mb_w; ++i) {
-        // Fill in the alpha value (converted to 4 bits).
-        const uint32_t alpha_value = VP8Clip4Bits(alpha[i]);
-        alpha_dst[2 * i] = (alpha_dst[2 * i] & 0xf0) | alpha_value;
-        alpha_mask &= alpha_value;
+
+    {
+      uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
+      uint8_t* alpha_dst = base_rgba + 1;
+      for (j = 0; j < num_rows; ++j) {
+        for (i = 0; i < mb_w; ++i) {
+          // Fill in the alpha value (converted to 4 bits).
+          const uint32_t alpha_value = alpha[i] >> 4;
+          alpha_dst[2 * i] = (alpha_dst[2 * i] & 0xf0) | alpha_value;
+          alpha_mask &= alpha_value;
+        }
+        alpha += io->width;
+        alpha_dst += buf->stride;
       }
-      alpha += io->width;
-      alpha_dst += buf->stride;
-    }
-    if (alpha_mask != 0x0f && p->output->colorspace == MODE_rgbA_4444) {
-      WebPApplyAlphaMultiply4444(base_rgba, mb_w, mb_h, buf->stride);
+      if (alpha_mask != 0x0f && p->output->colorspace == MODE_rgbA_4444) {
+        WebPApplyAlphaMultiply4444(base_rgba, mb_w, num_rows, buf->stride);
+      }
     }
   }
   return 0;
@@ -442,7 +453,7 @@
     WebPRescalerExportRow(&p->scaler_a);
     for (i = 0; i < width; ++i) {
       // Fill in the alpha value (converted to 4 bits).
-      const uint32_t alpha_value = VP8Clip4Bits(p->scaler_a.dst[i]);
+      const uint32_t alpha_value = p->scaler_a.dst[i] >> 4;
       alpha_dst[2 * i] = (alpha_dst[2 * i] & 0xf0) | alpha_value;
       alpha_mask &= alpha_value;
     }
diff --git a/src/dsp/dsp.h b/src/dsp/dsp.h
index 8fd0752..3aad309 100644
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@@ -49,6 +49,8 @@
 //------------------------------------------------------------------------------
 // Encoding
 
+int VP8GetAlpha(const int histo[]);
+
 // Transforms
 // VP8Idct: Does one of two inverse transforms. If do_two is set, the transforms
 //          will be done for (ref, in, dst) and (ref + 4, in + 16, dst + 4).
@@ -83,11 +85,10 @@
                                 int n, const struct VP8Matrix* const mtx);
 extern VP8QuantizeBlock VP8EncQuantizeBlock;
 
-// Collect histogram for susceptibility calculation and accumulate in histo[].
-struct VP8Histogram;
-typedef void (*VP8CHisto)(const uint8_t* ref, const uint8_t* pred,
-                          int start_block, int end_block,
-                          struct VP8Histogram* const histo);
+// Compute susceptibility based on DCT-coeff histograms:
+// the higher, the "easier" the macroblock is to compress.
+typedef int (*VP8CHisto)(const uint8_t* ref, const uint8_t* pred,
+                         int start_block, int end_block);
 extern const int VP8DspScan[16 + 4 + 4];
 extern VP8CHisto VP8CollectHistogram;
 
diff --git a/src/dsp/enc.c b/src/dsp/enc.c
index 1bac3bf..0223456 100644
--- a/src/dsp/enc.c
+++ b/src/dsp/enc.c
@@ -17,18 +17,31 @@
 extern "C" {
 #endif
 
-static WEBP_INLINE uint8_t clip_8b(int v) {
-  return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
-}
-
-static WEBP_INLINE int clip_max(int v, int max) {
-  return (v > max) ? max : v;
-}
-
 //------------------------------------------------------------------------------
 // Compute susceptibility based on DCT-coeff histograms:
 // the higher, the "easier" the macroblock is to compress.
 
+static int ClipAlpha(int alpha) {
+  return alpha < 0 ? 0 : alpha > 255 ? 255 : alpha;
+}
+
+int VP8GetAlpha(const int histo[MAX_COEFF_THRESH + 1]) {
+  int num = 0, den = 0, val = 0;
+  int k;
+  int alpha;
+  // note: changing this loop to avoid the numerous "k + 1" slows things down.
+  for (k = 0; k < MAX_COEFF_THRESH; ++k) {
+    if (histo[k + 1]) {
+      val += histo[k + 1];
+      num += val * (k + 1);
+      den += (k + 1) * (k + 1);
+    }
+  }
+  // we scale the value to a usable [0..255] range
+  alpha = den ? 10 * num / den - 5 : 0;
+  return ClipAlpha(alpha);
+}
+
 const int VP8DspScan[16 + 4 + 4] = {
   // Luma
   0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
@@ -40,23 +53,27 @@
   8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
 };
 
-static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
-                             int start_block, int end_block,
-                             VP8Histogram* const histo) {
-  int j;
+static int CollectHistogram(const uint8_t* ref, const uint8_t* pred,
+                            int start_block, int end_block) {
+  int histo[MAX_COEFF_THRESH + 1] = { 0 };
+  int16_t out[16];
+  int j, k;
   for (j = start_block; j < end_block; ++j) {
-    int k;
-    int16_t out[16];
-
     VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
 
-    // Convert coefficients to bin.
+    // Convert coefficients to bin (within out[]).
     for (k = 0; k < 16; ++k) {
-      const int v = abs(out[k]) >> 3;  // TODO(skal): add rounding?
-      const int clipped_value = clip_max(v, MAX_COEFF_THRESH);
-      histo->distribution[clipped_value]++;
+      const int v = abs(out[k]) >> 2;
+      out[k] = (v > MAX_COEFF_THRESH) ? MAX_COEFF_THRESH : v;
+    }
+
+    // Use bin to update histogram.
+    for (k = 0; k < 16; ++k) {
+      histo[out[k]]++;
     }
   }
+
+  return VP8GetAlpha(histo);
 }
 
 //------------------------------------------------------------------------------
@@ -72,12 +89,15 @@
   if (!tables_ok) {
     int i;
     for (i = -255; i <= 255 + 255; ++i) {
-      clip1[255 + i] = clip_8b(i);
+      clip1[255 + i] = (i < 0) ? 0 : (i > 255) ? 255 : i;
     }
     tables_ok = 1;
   }
 }
 
+static WEBP_INLINE uint8_t clip_8b(int v) {
+  return (!(v & ~0xff)) ? v : v < 0 ? 0 : 255;
+}
 
 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
diff --git a/src/dsp/enc_sse2.c b/src/dsp/enc_sse2.c
index f766211..b046761 100644
--- a/src/dsp/enc_sse2.c
+++ b/src/dsp/enc_sse2.c
@@ -25,15 +25,13 @@
 // Compute susceptibility based on DCT-coeff histograms:
 // the higher, the "easier" the macroblock is to compress.
 
-static void CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
-                                 int start_block, int end_block,
-                                 VP8Histogram* const histo) {
+static int CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
+                                int start_block, int end_block) {
+  int histo[MAX_COEFF_THRESH + 1] = { 0 };
+  int16_t out[16];
+  int j, k;
   const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
-  int j;
   for (j = start_block; j < end_block; ++j) {
-    int16_t out[16];
-    int k;
-
     VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
 
     // Convert coefficients to bin (within out[]).
@@ -49,9 +47,9 @@
       const __m128i xor1 = _mm_xor_si128(out1, sign1);
       const __m128i abs0 = _mm_sub_epi16(xor0, sign0);
       const __m128i abs1 = _mm_sub_epi16(xor1, sign1);
-      // v = abs(out) >> 3
-      const __m128i v0 = _mm_srai_epi16(abs0, 3);
-      const __m128i v1 = _mm_srai_epi16(abs1, 3);
+      // v = abs(out) >> 2
+      const __m128i v0 = _mm_srai_epi16(abs0, 2);
+      const __m128i v1 = _mm_srai_epi16(abs1, 2);
       // bin = min(v, MAX_COEFF_THRESH)
       const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
       const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
@@ -60,11 +58,13 @@
       _mm_storeu_si128((__m128i*)&out[8], bin1);
     }
 
-    // Convert coefficients to bin.
+    // Use bin to update histogram.
     for (k = 0; k < 16; ++k) {
-      histo->distribution[out[k]]++;
+      histo[out[k]]++;
     }
   }
+
+  return VP8GetAlpha(histo);
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/dsp/yuv.h b/src/dsp/yuv.h
index 29d7ea2..ee3587e 100644
--- a/src/dsp/yuv.h
+++ b/src/dsp/yuv.h
@@ -110,11 +110,6 @@
   rgba[3] = 0xff;
 }
 
-static WEBP_INLINE uint32_t VP8Clip4Bits(uint8_t c) {
-  const uint32_t v = (c + 8) >> 4;
-  return (v > 15) ? 15 : v;
-}
-
 // Must be called before everything, to initialize the tables.
 void VP8YUVInit(void);
 
diff --git a/src/enc/analysis.c b/src/enc/analysis.c
index a32fffc..22cfb49 100644
--- a/src/enc/analysis.c
+++ b/src/enc/analysis.c
@@ -23,6 +23,10 @@
 
 #define MAX_ITERS_K_MEANS  6
 
+static int ClipAlpha(int alpha) {
+  return alpha < 0 ? 0 : alpha > 255 ? 255 : alpha;
+}
+
 //------------------------------------------------------------------------------
 // Smooth the segment map by replacing isolated block by the majority of its
 // neighbours.
@@ -111,7 +115,7 @@
 }
 
 static WEBP_INLINE int clip(int v, int m, int M) {
-  return (v < m) ? m : (v > M) ? M : v;
+  return v < m ? m : v > M ? M : v;
 }
 
 static void SetSegmentAlphas(VP8Encoder* const enc,
@@ -138,63 +142,22 @@
 }
 
 //------------------------------------------------------------------------------
-// Compute susceptibility based on DCT-coeff histograms:
-// the higher, the "easier" the macroblock is to compress.
-
-#define MAX_ALPHA 255                // 8b of precision for susceptibilities.
-#define ALPHA_SCALE (2 * MAX_ALPHA)  // scaling factor for alpha.
-#define DEFAULT_ALPHA (-1)
-#define IS_BETTER_ALPHA(alpha, best_alpha) ((alpha) > (best_alpha))
-
-static int FinalAlphaValue(int alpha) {
-  alpha = MAX_ALPHA - alpha;
-  return clip(alpha, 0, MAX_ALPHA);
-}
-
-static int GetAlpha(const VP8Histogram* const histo) {
-  int max_value = 0, last_non_zero = 1;
-  int k;
-  int alpha;
-  for (k = 0; k <= MAX_COEFF_THRESH; ++k) {
-    const int value = histo->distribution[k];
-    if (value > 0) {
-      if (value > max_value) max_value = value;
-      last_non_zero = k;
-    }
-  }
-  // 'alpha' will later be clipped to [0..MAX_ALPHA] range, clamping outer
-  // values which happen to be mostly noise. This leaves the maximum precision
-  // for handling the useful small values which contribute most.
-  alpha = (max_value > 1) ? ALPHA_SCALE * last_non_zero / max_value : 0;
-  return alpha;
-}
-
-static void MergeHistograms(const VP8Histogram* const in,
-                            VP8Histogram* const out) {
-  int i;
-  for (i = 0; i <= MAX_COEFF_THRESH; ++i) {
-    out->distribution[i] += in->distribution[i];
-  }
-}
-
-//------------------------------------------------------------------------------
 // Simplified k-Means, to assign Nb segments based on alpha-histogram
 
-static void AssignSegments(VP8Encoder* const enc,
-                           const int alphas[MAX_ALPHA + 1]) {
+static void AssignSegments(VP8Encoder* const enc, const int alphas[256]) {
   const int nb = enc->segment_hdr_.num_segments_;
   int centers[NUM_MB_SEGMENTS];
   int weighted_average = 0;
-  int map[MAX_ALPHA + 1];
+  int map[256];
   int a, n, k;
-  int min_a = 0, max_a = MAX_ALPHA, range_a;
+  int min_a = 0, max_a = 255, range_a;
   // 'int' type is ok for histo, and won't overflow
   int accum[NUM_MB_SEGMENTS], dist_accum[NUM_MB_SEGMENTS];
 
   // bracket the input
-  for (n = 0; n <= MAX_ALPHA && alphas[n] == 0; ++n) {}
+  for (n = 0; n < 256 && alphas[n] == 0; ++n) {}
   min_a = n;
-  for (n = MAX_ALPHA; n > min_a && alphas[n] == 0; --n) {}
+  for (n = 255; n > min_a && alphas[n] == 0; --n) {}
   max_a = n;
   range_a = max_a - min_a;
 
@@ -247,7 +210,7 @@
     VP8MBInfo* const mb = &enc->mb_info_[n];
     const int alpha = mb->alpha_;
     mb->segment_ = map[alpha];
-    mb->alpha_ = centers[map[alpha]];  // for the record.
+    mb->alpha_ = centers[map[alpha]];     // just for the record.
   }
 
   if (nb > 1) {
@@ -273,19 +236,15 @@
 static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) {
   const int max_mode = (it->enc_->method_ >= 3) ? MAX_INTRA16_MODE : 4;
   int mode;
-  int best_alpha = DEFAULT_ALPHA;
+  int best_alpha = -1;
   int best_mode = 0;
 
   VP8MakeLuma16Preds(it);
   for (mode = 0; mode < max_mode; ++mode) {
-    VP8Histogram histo = { { 0 } };
-    int alpha;
-
-    VP8CollectHistogram(it->yuv_in_ + Y_OFF,
-                        it->yuv_p_ + VP8I16ModeOffsets[mode],
-                        0, 16, &histo);
-    alpha = GetAlpha(&histo);
-    if (IS_BETTER_ALPHA(alpha, best_alpha)) {
+    const int alpha = VP8CollectHistogram(it->yuv_in_ + Y_OFF,
+                                          it->yuv_p_ + VP8I16ModeOffsets[mode],
+                                          0, 16);
+    if (alpha > best_alpha) {
       best_alpha = alpha;
       best_mode = mode;
     }
@@ -298,58 +257,45 @@
                                    int best_alpha) {
   uint8_t modes[16];
   const int max_mode = (it->enc_->method_ >= 3) ? MAX_INTRA4_MODE : NUM_BMODES;
-  int i4_alpha;
-  VP8Histogram total_histo = { { 0 } };
-  int cur_histo = 0;
-
+  int i4_alpha = 0;
   VP8IteratorStartI4(it);
   do {
     int mode;
-    int best_mode_alpha = DEFAULT_ALPHA;
-    VP8Histogram histos[2];
+    int best_mode_alpha = -1;
     const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
 
     VP8MakeIntra4Preds(it);
     for (mode = 0; mode < max_mode; ++mode) {
-      int alpha;
-
-      memset(&histos[cur_histo], 0, sizeof(histos[cur_histo]));
-      VP8CollectHistogram(src, it->yuv_p_ + VP8I4ModeOffsets[mode],
-                          0, 1, &histos[cur_histo]);
-      alpha = GetAlpha(&histos[cur_histo]);
-      if (IS_BETTER_ALPHA(alpha, best_mode_alpha)) {
+      const int alpha = VP8CollectHistogram(src,
+                                            it->yuv_p_ + VP8I4ModeOffsets[mode],
+                                            0, 1);
+      if (alpha > best_mode_alpha) {
         best_mode_alpha = alpha;
         modes[it->i4_] = mode;
-        cur_histo ^= 1;   // keep track of best histo so far.
       }
     }
-    // accumulate best histogram
-    MergeHistograms(&histos[cur_histo ^ 1], &total_histo);
+    i4_alpha += best_mode_alpha;
     // Note: we reuse the original samples for predictors
   } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF));
 
-  i4_alpha = GetAlpha(&total_histo);
-  if (IS_BETTER_ALPHA(i4_alpha, best_alpha)) {
+  if (i4_alpha > best_alpha) {
     VP8SetIntra4Mode(it, modes);
-    best_alpha = i4_alpha;
+    best_alpha = ClipAlpha(i4_alpha);
   }
   return best_alpha;
 }
 
 static int MBAnalyzeBestUVMode(VP8EncIterator* const it) {
-  int best_alpha = DEFAULT_ALPHA;
+  int best_alpha = -1;
   int best_mode = 0;
   const int max_mode = (it->enc_->method_ >= 3) ? MAX_UV_MODE : 4;
   int mode;
   VP8MakeChroma8Preds(it);
   for (mode = 0; mode < max_mode; ++mode) {
-    VP8Histogram histo = { { 0 } };
-    int alpha;
-    VP8CollectHistogram(it->yuv_in_ + U_OFF,
-                        it->yuv_p_ + VP8UVModeOffsets[mode],
-                        16, 16 + 4 + 4, &histo);
-    alpha = GetAlpha(&histo);
-    if (IS_BETTER_ALPHA(alpha, best_alpha)) {
+    const int alpha = VP8CollectHistogram(it->yuv_in_ + U_OFF,
+                                          it->yuv_p_ + VP8UVModeOffsets[mode],
+                                          16, 16 + 4 + 4);
+    if (alpha > best_alpha) {
       best_alpha = alpha;
       best_mode = mode;
     }
@@ -359,7 +305,7 @@
 }
 
 static void MBAnalyze(VP8EncIterator* const it,
-                      int alphas[MAX_ALPHA + 1], int* const uv_alpha) {
+                      int alphas[256], int* const uv_alpha) {
   const VP8Encoder* const enc = it->enc_;
   int best_alpha, best_uv_alpha;
 
@@ -378,11 +324,10 @@
   best_uv_alpha = MBAnalyzeBestUVMode(it);
 
   // Final susceptibility mix
-  best_alpha = (3 * best_alpha + best_uv_alpha + 2) >> 2;
-  best_alpha = FinalAlphaValue(best_alpha);
+  best_alpha = (best_alpha + best_uv_alpha + 1) / 2;
   alphas[best_alpha]++;
   *uv_alpha += best_uv_alpha;
-  it->mb_->alpha_ = best_alpha;   // for later remapping.
+  it->mb_->alpha_ = best_alpha;   // Informative only.
 }
 
 //------------------------------------------------------------------------------
@@ -397,7 +342,7 @@
 
 int VP8EncAnalyze(VP8Encoder* const enc) {
   int ok = 1;
-  int alphas[MAX_ALPHA + 1] = { 0 };
+  int alphas[256] = { 0 };
   VP8EncIterator it;
 
   VP8IteratorInit(enc, &it);
diff --git a/src/enc/frame.c b/src/enc/frame.c
index 262d84e..bdd3600 100644
--- a/src/enc/frame.c
+++ b/src/enc/frame.c
@@ -736,7 +736,6 @@
         const int b = (int)((it->luma_bits_ + it->uv_bits_ + 7) >> 3);
         *info = (b > 255) ? 255 : b; break;
       }
-      case 7: *info = mb->alpha_; break;
       default: *info = 0; break;
     };
   }
diff --git a/src/enc/picture.c b/src/enc/picture.c
index 518b01e..44eed06 100644
--- a/src/enc/picture.c
+++ b/src/enc/picture.c
@@ -908,135 +908,65 @@
 
 
 //------------------------------------------------------------------------------
-// local-min distortion
-//
-// For every pixel in the *reference* picture, we search for the local best
-// match in the compressed image. This is not a symmetrical measure.
-
-// search radius. Shouldn't be too large.
-#define RADIUS 2
-
-static double AccumulateLSIM(const uint8_t* src, int src_stride,
-                             const uint8_t* ref, int ref_stride,
-                             int w, int h) {
-  int x, y;
-  double total_sse = 0.;
-  for (y = 0; y < h; ++y) {
-    const int y0 = (y - RADIUS < 0) ? 0 : y - RADIUS;
-    const int y1 = (y + RADIUS + 1 >= h) ? h : y + RADIUS + 1;
-    for (x = 0; x < w; ++x) {
-      const int x0 = (x - RADIUS < 0) ? 0 : x - RADIUS;
-      const int x1 = (x + RADIUS + 1 >= w) ? w : x + RADIUS + 1;
-      double best_sse = 255. * 255.;
-      const double value = (double)ref[y * ref_stride + x];
-      int i, j;
-      for (j = y0; j < y1; ++j) {
-        const uint8_t* s = src + j * src_stride;
-        for (i = x0; i < x1; ++i) {
-          const double sse = (double)(s[i] - value) * (s[i] - value);
-          if (sse < best_sse) best_sse = sse;
-        }
-      }
-      total_sse += best_sse;
-    }
-  }
-  return total_sse;
-}
-#undef RADIUS
-
-//------------------------------------------------------------------------------
 // Distortion
 
 // Max value returned in case of exact similarity.
 static const double kMinDistortion_dB = 99.;
-static float GetPSNR(const double v) {
-  return (float)((v > 0.) ? -4.3429448 * log(v / (255 * 255.))
-                          : kMinDistortion_dB);
-}
 
-int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
+int WebPPictureDistortion(const WebPPicture* pic1, const WebPPicture* pic2,
                           int type, float result[5]) {
+  int c;
+  DistoStats stats[5];
   int has_alpha;
 
-  if (src == NULL || ref == NULL ||
-      src->width != ref->width || src->height != ref->height ||
-      src->y == NULL || ref->y == NULL ||
-      src->u == NULL || ref->u == NULL ||
-      src->v == NULL || ref->v == NULL ||
+  if (pic1 == NULL || pic2 == NULL ||
+      pic1->width != pic2->width || pic1->height != pic2->height ||
+      pic1->y == NULL || pic2->y == NULL ||
+      pic1->u == NULL || pic2->u == NULL ||
+      pic1->v == NULL || pic2->v == NULL ||
       result == NULL) {
     return 0;
   }
   // TODO(skal): provide distortion for ARGB too.
-  if (src->use_argb == 1 || src->use_argb != ref->use_argb) {
+  if (pic1->use_argb == 1 || pic1->use_argb != pic2->use_argb) {
     return 0;
   }
 
-  has_alpha = !!(src->colorspace & WEBP_CSP_ALPHA_BIT);
-  if (has_alpha != !!(ref->colorspace & WEBP_CSP_ALPHA_BIT) ||
-      (has_alpha && (src->a == NULL || ref->a == NULL))) {
+  has_alpha = !!(pic1->colorspace & WEBP_CSP_ALPHA_BIT);
+  if (has_alpha != !!(pic2->colorspace & WEBP_CSP_ALPHA_BIT) ||
+      (has_alpha && (pic1->a == NULL || pic2->a == NULL))) {
     return 0;
   }
 
-  if (type >= 2) {
-    float sse[4];
-    const int uv_w = HALVE(src->width);
-    const int uv_h = HALVE(src->height);
-    sse[0] = AccumulateLSIM(src->y, src->y_stride,
-                            ref->y, ref->y_stride, src->width, src->height);
-    sse[1] = AccumulateLSIM(src->u, src->uv_stride,
-                            ref->u, ref->uv_stride, uv_w, uv_h);
-    sse[2] = AccumulateLSIM(src->v, src->uv_stride,
-                            ref->v, ref->uv_stride, uv_w, uv_h);
-    sse[3] = has_alpha ? AccumulateLSIM(src->a, src->a_stride,
-                                        ref->a, ref->a_stride,
-                                        src->width, src->height)
-                       : 0;
-    result[0] = GetPSNR(sse[0] / (src->width * src->height));
-    result[1] = GetPSNR(sse[1] / (uv_w * uv_h));
-    result[2] = GetPSNR(sse[2] / (uv_w * uv_h));
-    result[3] = GetPSNR(sse[3] / (src->width * src->height));
-    {
-      double total_sse = sse[0] + sse[1] + sse[2];
-      int total_pixels = src->width * src->height + 2 * uv_w * uv_h;
-      if (has_alpha) {
-        total_pixels += src->width * src->height;
-        total_sse += sse[3];
-      }
-      result[4] = GetPSNR(total_sse / total_pixels);
+  memset(stats, 0, sizeof(stats));
+  VP8SSIMAccumulatePlane(pic1->y, pic1->y_stride,
+                         pic2->y, pic2->y_stride,
+                         pic1->width, pic1->height, &stats[0]);
+  VP8SSIMAccumulatePlane(pic1->u, pic1->uv_stride,
+                         pic2->u, pic2->uv_stride,
+                         (pic1->width + 1) >> 1, (pic1->height + 1) >> 1,
+                         &stats[1]);
+  VP8SSIMAccumulatePlane(pic1->v, pic1->uv_stride,
+                         pic2->v, pic2->uv_stride,
+                         (pic1->width + 1) >> 1, (pic1->height + 1) >> 1,
+                         &stats[2]);
+  if (has_alpha) {
+    VP8SSIMAccumulatePlane(pic1->a, pic1->a_stride,
+                           pic2->a, pic2->a_stride,
+                           pic1->width, pic1->height, &stats[3]);
+  }
+  for (c = 0; c <= 4; ++c) {
+    if (type == 1) {
+      const double v = VP8SSIMGet(&stats[c]);
+      result[c] = (float)((v < 1.) ? -10.0 * log10(1. - v)
+                                   : kMinDistortion_dB);
+    } else {
+      const double v = VP8SSIMGetSquaredError(&stats[c]);
+      result[c] = (float)((v > 0.) ? -4.3429448 * log(v / (255 * 255.))
+                                   : kMinDistortion_dB);
     }
-  } else {
-    int c;
-    DistoStats stats[5];
-
-    memset(stats, 0, sizeof(stats));
-    VP8SSIMAccumulatePlane(src->y, src->y_stride,
-                           ref->y, ref->y_stride,
-                           src->width, src->height, &stats[0]);
-    VP8SSIMAccumulatePlane(src->u, src->uv_stride,
-                           ref->u, ref->uv_stride,
-                           HALVE(src->width), HALVE(src->height),
-                           &stats[1]);
-    VP8SSIMAccumulatePlane(src->v, src->uv_stride,
-                           ref->v, ref->uv_stride,
-                           HALVE(src->width), HALVE(src->height),
-                           &stats[2]);
-    if (has_alpha) {
-      VP8SSIMAccumulatePlane(src->a, src->a_stride,
-                             ref->a, ref->a_stride,
-                             src->width, src->height, &stats[3]);
-    }
-    for (c = 0; c <= 4; ++c) {
-      if (type == 1) {
-        const double v = VP8SSIMGet(&stats[c]);
-        result[c] = (float)((v < 1.) ? -10.0 * log10(1. - v)
-                                     : kMinDistortion_dB);
-      } else {
-        const double v = VP8SSIMGetSquaredError(&stats[c]);
-        result[c] = GetPSNR(v);
-      }
-      // Accumulate forward
-      if (c < 4) VP8SSIMAddStats(&stats[c], &stats[4]);
-    }
+    // Accumulate forward
+    if (c < 4) VP8SSIMAddStats(&stats[c], &stats[4]);
   }
   return 1;
 }
diff --git a/src/enc/vp8enci.h b/src/enc/vp8enci.h
index c270d71..a0d9001 100644
--- a/src/enc/vp8enci.h
+++ b/src/enc/vp8enci.h
@@ -29,6 +29,8 @@
 #define ENC_MIN_VERSION 2
 #define ENC_REV_VERSION 0
 
+// size of histogram used by CollectHistogram.
+#define MAX_COEFF_THRESH   64
 
 // intra prediction modes
 enum { B_DC_PRED = 0,   // 4x4 modes
@@ -160,14 +162,6 @@
 }
 extern const uint8_t VP8Zigzag[16];
 
-// size of histogram used by CollectHistogram.
-#define MAX_COEFF_THRESH   31
-typedef struct VP8Histogram VP8Histogram;
-struct VP8Histogram {
-  // TODO(skal): we only need to store the max_value and last_non_zero actually.
-  int distribution[MAX_COEFF_THRESH + 1];
-};
-
 //------------------------------------------------------------------------------
 // Headers
 
diff --git a/src/utils/huffman_encode.c b/src/utils/huffman_encode.c
index 3cd4b7f..2686c66 100644
--- a/src/utils/huffman_encode.c
+++ b/src/utils/huffman_encode.c
@@ -234,7 +234,7 @@
         tree_pool[tree_pool_size++] = tree[tree_size - 1];
         tree_pool[tree_pool_size++] = tree[tree_size - 2];
         count = tree_pool[tree_pool_size - 1].total_count_ +
-                tree_pool[tree_pool_size - 2].total_count_;
+            tree_pool[tree_pool_size - 2].total_count_;
         tree_size -= 2;
         {
           // Search for the insertion point.