Merge "Skia Merge (revision 3312)"
diff --git a/Android.mk b/Android.mk
index 1282c7f..f1e81d2 100644
--- a/Android.mk
+++ b/Android.mk
@@ -79,12 +79,14 @@
 	src/core/SkColorTable.cpp \
 	src/core/SkComposeShader.cpp \
 	src/core/SkConcaveToTriangles.cpp \
+	src/core/SkConfig8888.cpp \
 	src/core/SkCordic.cpp \
 	src/core/SkCubicClipper.cpp \
 	src/core/SkData.cpp \
 	src/core/SkDebug.cpp \
 	src/core/SkDeque.cpp \
 	src/core/SkDevice.cpp \
+	src/core/SkDeviceProfile.cpp \
 	src/core/SkDither.cpp \
 	src/core/SkDraw.cpp \
   src/core/SkEdgeBuilder.cpp \
@@ -172,6 +174,7 @@
 	src/effects/SkKernel33MaskFilter.cpp \
 	src/effects/SkLayerDrawLooper.cpp \
 	src/effects/SkLayerRasterizer.cpp \
+	src/effects/SkMorphologyImageFilter.cpp \
 	src/effects/SkPaintFlagsDrawFilter.cpp \
 	src/effects/SkPixelXorXfermode.cpp \
 	src/effects/SkPorterDuff.cpp \
@@ -215,12 +218,15 @@
 	src/ports/SkImageRef_ashmem.cpp \
 	src/ports/SkMemory_malloc.cpp \
 	src/ports/SkOSFile_stdio.cpp \
+	src/ports/SkThread_pthread.cpp \
 	src/ports/SkTime_Unix.cpp \
+	src/utils/SkBase64.cpp \
 	src/utils/SkBoundaryPatch.cpp \
 	src/utils/SkCamera.cpp \
 	src/utils/SkColorMatrix.cpp \
   src/utils/SkCubicInterval.cpp \
 	src/utils/SkCullPoints.cpp \
+	src/utils/SkDeferredCanvas.cpp \
 	src/utils/SkDumpCanvas.cpp \
 	src/utils/SkInterpolator.cpp \
 	src/utils/SkLayer.cpp \
@@ -320,18 +326,19 @@
 
 LOCAL_SRC_FILES:= \
   src/gpu/GrPrintf_skia.cpp \
-  src/gpu/SkGLContext.cpp \
   src/gpu/SkGpuCanvas.cpp \
   src/gpu/SkGpuDevice.cpp \
   src/gpu/SkGr.cpp \
   src/gpu/SkGrFontScaler.cpp \
   src/gpu/SkGrTexturePixelRef.cpp \
-  src/gpu/SkNullGLContext.cpp \
-  src/gpu/android/SkNativeGLContext_android.cpp
+  src/gpu/android/SkNativeGLContext_android.cpp \
+  src/gpu/gl/SkGLContext.cpp \
+  src/gpu/gl/SkNullGLContext.cpp
 
 LOCAL_SRC_FILES += \
   src/gpu/GrAAHairLinePathRenderer.cpp \
-  src/gpu/GrAddPathRenderers_aahairline.cpp \
+  src/gpu/GrAAConvexPathRenderer.cpp \
+  src/gpu/GrAddPathRenderers_default.cpp \
   src/gpu/GrAllocPool.cpp \
   src/gpu/GrAtlas.cpp \
   src/gpu/GrBufferAllocPool.cpp \
@@ -339,21 +346,8 @@
   src/gpu/GrContext.cpp \
   src/gpu/GrDefaultPathRenderer.cpp \
   src/gpu/GrDrawTarget.cpp \
-  src/gpu/GrGLCreateNullInterface.cpp \
-  src/gpu/GrGLDefaultInterface_native.cpp \
-  src/gpu/GrGLIndexBuffer.cpp \
-  src/gpu/GrGLInterface.cpp \
-  src/gpu/GrGLProgram.cpp \
-  src/gpu/GrGLRenderTarget.cpp \
-  src/gpu/GrGLSL.cpp \
-  src/gpu/GrGLStencilBuffer.cpp \
-  src/gpu/GrGLTexture.cpp \
-  src/gpu/GrGLUtil.cpp \
-  src/gpu/GrGLVertexBuffer.cpp \
   src/gpu/GrGpu.cpp \
   src/gpu/GrGpuFactory.cpp \
-  src/gpu/GrGpuGL.cpp \
-  src/gpu/GrGpuGLShaders.cpp \
   src/gpu/GrInOrderDrawBuffer.cpp \
   src/gpu/GrMatrix.cpp \
   src/gpu/GrMemory.cpp \
@@ -372,6 +366,23 @@
   src/gpu/GrTexture.cpp \
   src/gpu/gr_unittests.cpp \
   src/gpu/android/GrGLCreateNativeInterface_android.cpp
+
+LOCAL_SRC_FILES += \
+  src/gpu/gl/GrGLCaps.cpp \
+  src/gpu/gl/GrGLContextInfo.cpp \
+  src/gpu/gl/GrGLCreateNullInterface.cpp \
+  src/gpu/gl/GrGLDefaultInterface_native.cpp \
+  src/gpu/gl/GrGLIndexBuffer.cpp \
+  src/gpu/gl/GrGLInterface.cpp \
+  src/gpu/gl/GrGLProgram.cpp \
+  src/gpu/gl/GrGLRenderTarget.cpp \
+  src/gpu/gl/GrGLSL.cpp \
+  src/gpu/gl/GrGLStencilBuffer.cpp \
+  src/gpu/gl/GrGLTexture.cpp \
+  src/gpu/gl/GrGLUtil.cpp \
+  src/gpu/gl/GrGLVertexBuffer.cpp \
+  src/gpu/gl/GrGpuGL.cpp \
+  src/gpu/gl/GrGpuGLShaders.cpp
   
 LOCAL_STATIC_LIBRARIES := libskiatess
 LOCAL_SHARED_LIBRARIES := \
diff --git a/bench/Android.mk b/bench/Android.mk
index ab2e588..71a44b9 100644
--- a/bench/Android.mk
+++ b/bench/Android.mk
@@ -21,6 +21,7 @@
   MatrixBench.cpp \
   MutexBench.cpp \
   PathBench.cpp \
+  PicturePlaybackBench.cpp \
   RectBench.cpp \
   RepeatTileBench.cpp \
   ScalarBench.cpp \
diff --git a/bench/BenchGpuTimer_gl.cpp b/bench/BenchGpuTimer_gl.cpp
index 885f7b2..b7bd88b 100644
--- a/bench/BenchGpuTimer_gl.cpp
+++ b/bench/BenchGpuTimer_gl.cpp
@@ -6,7 +6,7 @@
  * found in the LICENSE file.
  */
 #include "BenchGpuTimer_gl.h"
-#include "SkGLContext.h"
+#include "gl/SkGLContext.h"
 
 BenchGpuTimer::BenchGpuTimer(const SkGLContext* glctx) {
     fContext = glctx;
diff --git a/bench/MutexBench.cpp b/bench/MutexBench.cpp
index d9b427b..af8a840 100644
--- a/bench/MutexBench.cpp
+++ b/bench/MutexBench.cpp
@@ -23,7 +23,7 @@
 
     virtual void onDraw(SkCanvas* canvas) {
         for (int i = 0; i < N; i++) {
-            SkMutex mu;
+            SK_DECLARE_STATIC_MUTEX(mu);
             for (int j = 0; j < M; j++) {
                 mu.acquire();
                 mu.release();
diff --git a/bench/PathBench.cpp b/bench/PathBench.cpp
index d3e01b7..f9de53c 100644
--- a/bench/PathBench.cpp
+++ b/bench/PathBench.cpp
@@ -182,6 +182,26 @@
     typedef PathBench INHERITED;
 };
 
+class LongLinePathBench : public PathBench {
+public:
+    LongLinePathBench(void * param, Flags flags) : INHERITED(param, flags) {}
+    virtual void appendName(SkString* name) { name->append("long_line"); }
+    virtual void makePath(SkPath* path) {
+        SkRandom rand;
+        // Sequence x before y: argument evaluation order is unspecified in C++.
+        SkScalar x = rand.nextUScalar1() * 640;
+        SkScalar y = rand.nextUScalar1() * 480;
+        path->moveTo(x, y);
+        for (size_t i = 1; i < 100; i++) {
+            x = rand.nextUScalar1() * 640;
+            y = rand.nextUScalar1() * 480;
+            path->lineTo(x, y);
+        }
+    }
+    virtual int complexity() { return 2; }
+private:
+    typedef PathBench INHERITED;
+};
 
 
 static SkBenchmark* FactT00(void* p) { return new TrianglePathBench(p, FLAGS00); }
@@ -209,6 +229,14 @@
     return new LongCurvedPathBench(p, FLAGS01);
 }
 
+static SkBenchmark* FactLL00(void* p) {
+    return new LongLinePathBench(p, FLAGS00);
+}
+
+static SkBenchmark* FactLL01(void* p) {
+    return new LongLinePathBench(p, FLAGS01);
+}
+
 static BenchRegistry gRegT00(FactT00);
 static BenchRegistry gRegT01(FactT01);
 static BenchRegistry gRegT10(FactT10);
@@ -230,3 +258,6 @@
 static BenchRegistry gRegLC00(FactLC00);
 static BenchRegistry gRegLC01(FactLC01);
 
+static BenchRegistry gRegLL00(FactLL00);
+static BenchRegistry gRegLL01(FactLL01);
+
diff --git a/bench/PicturePlaybackBench.cpp b/bench/PicturePlaybackBench.cpp
new file mode 100644
index 0000000..6a07fb5
--- /dev/null
+++ b/bench/PicturePlaybackBench.cpp
@@ -0,0 +1,147 @@
+/*
+ * Copyright 2011 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+#include "SkBenchmark.h"
+#include "SkCanvas.h"
+#include "SkColor.h"
+#include "SkPaint.h"
+#include "SkPicture.h"
+#include "SkPoint.h"
+#include "SkRect.h"
+#include "SkString.h"
+
+// This is designed to emulate about 4 screens of textual content
+
+
+class PicturePlaybackBench : public SkBenchmark {
+public:
+    PicturePlaybackBench(void* param, const char name[]) : INHERITED(param) {
+        fName.printf("picture_playback_%s", name);
+        fPictureWidth = SkIntToScalar(PICTURE_WIDTH);
+        fPictureHeight = SkIntToScalar(PICTURE_HEIGHT);
+        fTextSize = SkIntToScalar(TEXT_SIZE);
+    }
+
+    enum {
+        N = SkBENCHLOOP(1000),   // number of times to playback the picture
+        PICTURE_WIDTH = 1000,
+        PICTURE_HEIGHT = 4000,
+        TEXT_SIZE = 10
+    };
+protected:
+    virtual const char* onGetName() {
+        return fName.c_str();
+    }
+
+    // Records the picture once, then plays it back N times, shifting each time.
+    virtual void onDraw(SkCanvas* canvas) {
+        SkPicture picture;
+        SkCanvas* pCanvas = picture.beginRecording(PICTURE_WIDTH, PICTURE_HEIGHT);
+        recordCanvas(pCanvas);
+        picture.endRecording();
+        const SkPoint translateDelta = getTranslateDelta();
+        for (int i = 0; i < N; i++) {
+            picture.draw(canvas);
+            canvas->translate(translateDelta.fX, translateDelta.fY);
+        }
+    }
+
+    // Subclasses issue the draw calls that get captured in the picture.
+    virtual void recordCanvas(SkCanvas* canvas) = 0;
+    // Translation applied after each playback. Divide the scalar, not the
+    // ints: integer division turned any difference smaller than N into 0.
+    virtual SkPoint getTranslateDelta() {
+        SkIPoint canvasSize = onGetSize();
+        return SkPoint::Make(SkIntToScalar(PICTURE_WIDTH - canvasSize.fX) / N,
+                             SkIntToScalar(PICTURE_HEIGHT - canvasSize.fY) / N);
+    }
+
+    SkString fName;
+    SkScalar fPictureWidth;
+    SkScalar fPictureHeight;
+    SkScalar fTextSize;
+private:
+    typedef SkBenchmark INHERITED;
+};
+
+
+class TextPlaybackBench : public PicturePlaybackBench {
+public:
+    TextPlaybackBench(void* param) : INHERITED(param, "drawText") { }
+protected:
+    // Fills the picture with one string repeated on a grid of drawText calls.
+    virtual void recordCanvas(SkCanvas* canvas) {
+        const char* sample = "Hamburgefons";
+        const size_t sampleLen = strlen(sample);
+
+        SkPaint textPaint;
+        textPaint.setColor(SK_ColorBLACK);
+        textPaint.setTextSize(fTextSize);
+        const SkScalar columnStep = textPaint.measureText(sample, sampleLen);
+        for (SkScalar left = 0; left < fPictureWidth; left += columnStep) {
+            for (SkScalar base = 0; base < fPictureHeight; base += fTextSize) {
+                canvas->drawText(sample, sampleLen, left, base, textPaint);
+            }
+        }
+    }
+private:
+    typedef PicturePlaybackBench INHERITED;
+};
+
+class PosTextPlaybackBench : public PicturePlaybackBench {
+public:
+    PosTextPlaybackBench(void* param, bool drawPosH)
+        : INHERITED(param, drawPosH ? "drawPosTextH" : "drawPosText")
+        , fDrawPosH(drawPosH) { }
+protected:
+    // Tiles the picture with drawPosText; in H mode all glyphs share one y.
+    virtual void recordCanvas(SkCanvas* canvas) {
+        SkPaint paint;
+        paint.setTextSize(fTextSize);
+        paint.setColor(SK_ColorBLACK);
+
+        const char* text = "Hamburgefons";
+        size_t len = strlen(text);
+        const SkScalar textWidth = paint.measureText(text, len);
+
+        // Scratch arrays are allocated once rather than once per string drawn.
+        SkScalar* adv = new SkScalar[len];
+        SkPoint* pos = new SkPoint[len];
+        paint.getTextWidths(text, len, adv);
+
+        for (SkScalar x = 0; x < fPictureWidth; x += textWidth) {
+            for (SkScalar y = 0; y < fPictureHeight; y += fTextSize) {
+                SkScalar advX = 0;
+                for (size_t i = 0; i < len; i++) {
+                    if (fDrawPosH) {
+                        pos[i].set(x + advX, y);
+                    } else {
+                        pos[i].set(x + advX, y + SkIntToScalar(i));
+                    }
+                    advX += adv[i];
+                }
+                canvas->drawPosText(text, len, pos, paint);
+            }
+        }
+        delete[] pos;
+        delete[] adv;
+    }
+private:
+    bool fDrawPosH;
+    typedef PicturePlaybackBench INHERITED;
+};
+
+
+///////////////////////////////////////////////////////////////////////////////
+
+static SkBenchmark* Fact0(void* p) { return new TextPlaybackBench(p); }
+static SkBenchmark* Fact1(void* p) { return new PosTextPlaybackBench(p, true); }
+static SkBenchmark* Fact2(void* p) { return new PosTextPlaybackBench(p, false); }
+
+static BenchRegistry gReg0(Fact0);
+static BenchRegistry gReg1(Fact1);
+static BenchRegistry gReg2(Fact2);
+
diff --git a/bench/TextBench.cpp b/bench/TextBench.cpp
index 63a7167..ed8fb0e 100644
--- a/bench/TextBench.cpp
+++ b/bench/TextBench.cpp
@@ -81,13 +81,13 @@
     virtual const char* onGetName() {
         fName.printf("text_%g", SkScalarToFloat(fPaint.getTextSize()));
         if (fDoPos) {
-            fName.appendf("_pos");
+            fName.append("_pos");
         }
         fName.appendf("_%s", fontQualityName(fPaint));
         if (SK_ColorBLACK != fPaint.getColor()) {
             fName.appendf("_%02X", fPaint.getAlpha());
         } else {
-            fName.appendf("_BK");
+            fName.append("_BK");
         }
         return fName.c_str();
     }
diff --git a/bench/benchmain.cpp b/bench/benchmain.cpp
index 024ad0f..7732268 100644
--- a/bench/benchmain.cpp
+++ b/bench/benchmain.cpp
@@ -18,8 +18,8 @@
 #include "SkGpuDevice.h"
 #include "SkGraphics.h"
 #include "SkImageEncoder.h"
-#include "SkNativeGLContext.h"
-#include "SkNullGLContext.h"
+#include "gl/SkNativeGLContext.h"
+#include "gl/SkNullGLContext.h"
 #include "SkNWayCanvas.h"
 #include "SkPicture.h"
 #include "SkString.h"
diff --git a/gm/Android.mk b/gm/Android.mk
index 14f4d62..8dcb0ef 100644
--- a/gm/Android.mk
+++ b/gm/Android.mk
@@ -3,33 +3,59 @@
 include $(CLEAR_VARS)
 
 LOCAL_SRC_FILES := \
+  gm.cpp \
+  gmmain.cpp \
+  system_preferences_default.cpp
+
+# Slides
+LOCAL_SRC_FILES += \
+  aaclip.cpp \
   aarectmodes.cpp \
+  arithmode.cpp \
+  bitmapcopy.cpp \
   bitmapfilters.cpp \
   bitmapscroll.cpp \
   blurs.cpp \
+  colormatrix.cpp \
   complexclip.cpp \
   complexclip2.cpp \
+  convexpaths.cpp \
+  cubicpaths.cpp \
+  degeneratesegments.cpp \
+  drawbitmaprect.cpp \
   emptypath.cpp \
   filltypes.cpp \
   filltypespersp.cpp \
-  gm.cpp \
-  gmmain.cpp \
+  fontscaler.cpp \
+  gammatext.cpp \
   gradients.cpp \
+  gradtext.cpp \
   hairmodes.cpp \
+  imageblur.cpp \
   lcdtext.cpp \
+  linepaths.cpp \
+  morphology.cpp \
   ninepatchstretch.cpp \
   nocolorbleed.cpp \
+  patheffects.cpp \
   pathfill.cpp \
+  pathreverse.cpp \
   points.cpp \
   poly2poly.cpp \
+  quadpaths.cpp \
   shadertext.cpp \
   shadows.cpp \
   shapes.cpp \
+  strokefill.cpp \
   strokerects.cpp \
   strokes.cpp \
+  tablecolorfilter.cpp \
+  testimagefilters.cpp \
   texdata.cpp \
   tilemodes.cpp \
   tinybitmap.cpp \
+  verttext.cpp \
+  verttext2.cpp \
   xfermodes.cpp
 
 LOCAL_STATIC_LIBRARIES := libskiagpu
diff --git a/gm/convexpaths.cpp b/gm/convexpaths.cpp
new file mode 100644
index 0000000..2c719e8
--- /dev/null
+++ b/gm/convexpaths.cpp
@@ -0,0 +1,202 @@
+
+/*
+ * Copyright 2011 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+#include "gm.h"
+#include "SkRandom.h"
+#include "SkTArray.h"
+
+namespace skiagm {
+
+class ConvexPathsGM : public GM {
+public:
+    ConvexPathsGM() {
+        this->setBGColor(0xFF000000);
+        this->makePaths();
+    }
+
+protected:
+    virtual SkString onShortName() {
+        return SkString("convexpaths");
+    }
+
+
+    virtual SkISize onISize() {
+        return make_isize(1200, 900);
+    }
+
+    void makePaths() {
+        // CW
+        fPaths.push_back().moveTo(0, 0);
+        fPaths.back().quadTo(50 * SK_Scalar1, 100 * SK_Scalar1,
+                             0, 100 * SK_Scalar1);
+        fPaths.back().lineTo(0, 0);
+
+        // CCW
+        fPaths.push_back().moveTo(0, 0);
+        fPaths.back().lineTo(0, 100 * SK_Scalar1);
+        fPaths.back().quadTo(50 * SK_Scalar1, 100 * SK_Scalar1,
+                             0, 0);
+
+        // CW
+        fPaths.push_back().moveTo(0, 50 * SK_Scalar1);
+        fPaths.back().quadTo(50 * SK_Scalar1, 0,
+                             100 * SK_Scalar1, 50 * SK_Scalar1);
+        fPaths.back().quadTo(50 * SK_Scalar1, 100 * SK_Scalar1,
+                             0, 50 * SK_Scalar1);
+
+        // CCW
+        fPaths.push_back().moveTo(0, 50 * SK_Scalar1);
+        fPaths.back().quadTo(50 * SK_Scalar1, 100 * SK_Scalar1,
+                             100 * SK_Scalar1, 50 * SK_Scalar1);
+        fPaths.back().quadTo(50 * SK_Scalar1, 0,
+                             0, 50 * SK_Scalar1);
+
+        fPaths.push_back().addRect(0, 0,
+                                   100 * SK_Scalar1, 100 * SK_Scalar1,
+                                   SkPath::kCW_Direction);
+
+        fPaths.push_back().addRect(0, 0,
+                                   100 * SK_Scalar1, 100 * SK_Scalar1,
+                                   SkPath::kCCW_Direction);
+
+        fPaths.push_back().addCircle(50  * SK_Scalar1, 50  * SK_Scalar1,
+                                     50  * SK_Scalar1, SkPath::kCW_Direction);
+
+        fPaths.push_back().addCircle(50  * SK_Scalar1, 50  * SK_Scalar1,
+                                     40  * SK_Scalar1, SkPath::kCCW_Direction);
+
+        fPaths.push_back().addOval(SkRect::MakeXYWH(0, 0,
+                                                    50 * SK_Scalar1,
+                                                    100 * SK_Scalar1),
+                                   SkPath::kCW_Direction);
+
+        fPaths.push_back().addOval(SkRect::MakeXYWH(0, 0,
+                                                    100 * SK_Scalar1,
+                                                    50 * SK_Scalar1),
+                                   SkPath::kCCW_Direction);
+
+        fPaths.push_back().addOval(SkRect::MakeXYWH(0, 0,
+                                                    100 * SK_Scalar1,
+                                                    5 * SK_Scalar1),
+                                   SkPath::kCCW_Direction);
+
+        fPaths.push_back().addOval(SkRect::MakeXYWH(0, 0,
+                                                    SK_Scalar1,
+                                                    100 * SK_Scalar1),
+                                   SkPath::kCCW_Direction);
+
+        fPaths.push_back().addRoundRect(SkRect::MakeXYWH(0, 0,
+                                                         SK_Scalar1 * 100,
+                                                         SK_Scalar1 * 100),
+                                        40 * SK_Scalar1, 20 * SK_Scalar1,
+                                        SkPath::kCW_Direction);
+
+        fPaths.push_back().addRoundRect(SkRect::MakeXYWH(0, 0,
+                                                         SK_Scalar1 * 100,
+                                                         SK_Scalar1 * 100),
+                                        20 * SK_Scalar1, 40 * SK_Scalar1,
+                                        SkPath::kCCW_Direction);
+
+        // shallow diagonals
+        fPaths.push_back().lineTo(100 * SK_Scalar1, SK_Scalar1);
+        fPaths.back().lineTo(98 * SK_Scalar1, 100 * SK_Scalar1);
+        fPaths.back().lineTo(3 * SK_Scalar1, 96 * SK_Scalar1);
+
+        /*
+        It turns out arcTos are not automatically marked as convex and they
+        may in fact be ever so slightly concave.
+        fPaths.push_back().arcTo(SkRect::MakeXYWH(0, 0,
+                                                  50 * SK_Scalar1,
+                                                  100 * SK_Scalar1),
+                                 25 * SK_Scalar1,  130 * SK_Scalar1, false);
+        */
+
+        // cubics
+        fPaths.push_back().cubicTo( 1 * SK_Scalar1,  1 * SK_Scalar1,
+                                   10 * SK_Scalar1,  90 * SK_Scalar1,
+                                    0 * SK_Scalar1, 100 * SK_Scalar1);
+        fPaths.push_back().cubicTo(100 * SK_Scalar1,  50 * SK_Scalar1,
+                                    20 * SK_Scalar1, 100 * SK_Scalar1,
+                                     0 * SK_Scalar1,   0 * SK_Scalar1);
+
+        // triangle where one edge is a degenerate quad
+        fPaths.push_back().moveTo(SkFloatToScalar(8.59375f), 45 * SK_Scalar1);
+        fPaths.back().quadTo(SkFloatToScalar(16.9921875f),   45 * SK_Scalar1,
+                             SkFloatToScalar(31.25f),        45 * SK_Scalar1);
+        fPaths.back().lineTo(100 * SK_Scalar1,              100 * SK_Scalar1);
+        fPaths.back().lineTo(SkFloatToScalar(8.59375f),      45 * SK_Scalar1);
+
+        // point degenerate
+        fPaths.push_back().moveTo(50 * SK_Scalar1, 50 * SK_Scalar1);
+        fPaths.back().lineTo(50 * SK_Scalar1, 50 * SK_Scalar1);
+        
+        fPaths.push_back().moveTo(50 * SK_Scalar1, 50 * SK_Scalar1);
+        fPaths.back().quadTo(50 * SK_Scalar1, 50 * SK_Scalar1,
+                             50 * SK_Scalar1, 50 * SK_Scalar1);
+        fPaths.push_back().moveTo(50 * SK_Scalar1, 50 * SK_Scalar1);
+        fPaths.back().cubicTo(50 * SK_Scalar1, 50 * SK_Scalar1,
+                              50 * SK_Scalar1, 50 * SK_Scalar1,
+                              50 * SK_Scalar1, 50 * SK_Scalar1);
+
+        // moveTo only paths
+        fPaths.push_back().moveTo(0, 0);
+        fPaths.back().moveTo(0, 0);
+        fPaths.back().moveTo(SK_Scalar1, SK_Scalar1);
+        fPaths.back().moveTo(SK_Scalar1, SK_Scalar1);
+        fPaths.back().moveTo(10 * SK_Scalar1, 10 * SK_Scalar1);
+
+        fPaths.push_back().moveTo(0, 0);
+        fPaths.back().moveTo(0, 0);
+
+        // line degenerate
+        fPaths.push_back().lineTo(100 * SK_Scalar1, 100 * SK_Scalar1);
+        fPaths.push_back().quadTo(100 * SK_Scalar1, 100 * SK_Scalar1, 0, 0);
+        fPaths.push_back().quadTo(100 * SK_Scalar1, 100 * SK_Scalar1,
+                                  50 * SK_Scalar1, 50 * SK_Scalar1);
+        fPaths.push_back().quadTo(50 * SK_Scalar1, 50 * SK_Scalar1,
+                                  100 * SK_Scalar1, 100 * SK_Scalar1);
+        fPaths.push_back().cubicTo(0, 0,
+                                   0, 0,
+                                   100 * SK_Scalar1, 100 * SK_Scalar1);
+
+        // small circle. This is listed last so that it has device coords far
+        // from the origin (small area relative to x,y values).
+        fPaths.push_back().addCircle(0, 0, SkFloatToScalar(0.8f));
+    }
+
+    // Draws every path in fPaths in a five-column grid, each filled with a
+    // random opaque color.
+    virtual void onDraw(SkCanvas* canvas) {
+        SkRandom colorSource;
+        SkPaint fill;
+        fill.setAntiAlias(true);
+        canvas->translate(20 * SK_Scalar1, 20 * SK_Scalar1);
+        for (int index = 0; index < fPaths.count(); ++index) {
+            const SkPath& shape = fPaths[index];
+            SkASSERT(shape.isConvex());
+            fill.setColor(colorSource.nextU() | 0xff000000);
+            canvas->save();
+            // Cell origin is nudged by (1/4, 3/4) to land on off-integer coords.
+            canvas->translate(SK_Scalar1 * 200 * (index % 5) + SK_Scalar1 / 4,
+                              SK_Scalar1 * 200 * (index / 5) + 3 * SK_Scalar1 / 4);
+            canvas->drawPath(shape, fill);
+            canvas->restore();
+        }
+    }
+    
+private:
+    typedef GM INHERITED;
+    SkTArray<SkPath> fPaths;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+
+static GM* MyFactory(void*) { return new ConvexPathsGM; }
+static GMRegistry reg(MyFactory);
+
+}
+
diff --git a/gm/gammatext.cpp b/gm/gammatext.cpp
new file mode 100644
index 0000000..6f1c298
--- /dev/null
+++ b/gm/gammatext.cpp
@@ -0,0 +1,192 @@
+/*
+ * Copyright 2011 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "gm.h"
+#include "SkCanvas.h"
+#include "SkPath.h"
+#include "SkGradientShader.h"
+#include "SkTypeface.h"
+
+static SkShader* make_heatGradient(const SkPoint pts[2]) {
+    const SkColor colors[] = {
+        SK_ColorBLACK, SK_ColorBLUE, SK_ColorCYAN, SK_ColorGREEN,
+        SK_ColorYELLOW, SK_ColorRED, SK_ColorWHITE
+    };
+    const SkColor bw[] = { SK_ColorBLACK, SK_ColorWHITE };
+    // Builds a clamped linear gradient over pts from bw only; colors is unused.
+    return SkGradientShader::CreateLinear(pts, bw, NULL,
+                                          SK_ARRAY_COUNT(bw),
+                                          SkShader::kClamp_TileMode);
+}
+
+static bool setFont(SkPaint* paint, const char name[]) {
+    SkTypeface* tf = SkTypeface::CreateFromName(name, SkTypeface::kNormal);
+    if (tf) {
+        paint->setTypeface(tf)->unref();
+        return true;
+    }
+    return false;
+}
+
+#ifdef SK_BUILD_FOR_MAC
+#import <ApplicationServices/ApplicationServices.h>
+#define BITMAP_INFO_RGB     (kCGImageAlphaNoneSkipFirst | kCGBitmapByteOrder32Host)
+
+static CGContextRef makeCG(const SkBitmap& bm) {
+    if (SkBitmap::kARGB_8888_Config != bm.config() ||
+        NULL == bm.getPixels()) {
+        return NULL;
+    }
+    CGColorSpaceRef space = CGColorSpaceCreateDeviceRGB();
+    CGContextRef cg = CGBitmapContextCreate(bm.getPixels(), bm.width(), bm.height(),
+                                            8, bm.rowBytes(), space, BITMAP_INFO_RGB);
+    CFRelease(space);
+    
+    CGContextSetAllowsFontSubpixelQuantization(cg, false);
+    CGContextSetShouldSubpixelQuantizeFonts(cg, false);
+    
+    return cg;
+}
+
+extern CTFontRef SkTypeface_GetCTFontRef(const SkTypeface* face);
+
+static CGFontRef typefaceToCGFont(const SkTypeface* face) {
+    if (NULL == face) {
+        return 0;
+    }
+
+    CTFontRef ct = SkTypeface_GetCTFontRef(face);
+    return CTFontCopyGraphicsFont(ct, NULL);
+}
+
+static void cgSetPaintForText(CGContextRef cg, const SkPaint& paint) {
+    SkColor c = paint.getColor();
+    CGFloat rgba[] = {
+        SkColorGetB(c) / 255.0,
+        SkColorGetG(c) / 255.0,
+        SkColorGetR(c) / 255.0,
+        SkColorGetA(c) / 255.0,
+    };
+    CGContextSetRGBFillColor(cg, rgba[0], rgba[1], rgba[2], rgba[3]);
+    CGContextSetTextDrawingMode(cg, kCGTextFill);
+    // typefaceToCGFont returns a copy that we own; release it once it is set.
+    CGFontRef font = typefaceToCGFont(paint.getTypeface());
+    CGContextSetFont(cg, font);
+    CGFontRelease(font);
+    CGContextSetFontSize(cg, SkScalarToFloat(paint.getTextSize()));
+    CGContextSetAllowsFontSubpixelPositioning(cg, paint.isSubpixelText());
+    CGContextSetShouldSubpixelPositionFonts(cg, paint.isSubpixelText());
+    CGContextSetShouldAntialias(cg, paint.isAntiAlias());
+    CGContextSetShouldSmoothFonts(cg, paint.isLCDRenderText());
+}
+
+static void cgDrawText(CGContextRef cg, const void* text, size_t len,
+                       float x, float y, const SkPaint& paint) {
+    if (cg) {
+        cgSetPaintForText(cg, paint);
+
+        uint16_t glyphs[200];
+        int count = paint.textToGlyphs(text, len, glyphs);
+
+        CGContextShowGlyphsAtPoint(cg, x, y, glyphs, count);
+    }
+}
+#endif
+
+namespace skiagm {
+
+/**
+   Draws LCD-antialiased sample text in eight foreground colors, one
+   column per color, over a vertical black-to-white gradient, so the
+   same text can be inspected against backgrounds ranging from dark
+   to light.
+*/
+    
+#define HEIGHT 480
+
+class GammaTextGM : public GM {
+public:
+    GammaTextGM() {
+
+    }
+
+protected:
+    virtual SkString onShortName() {
+        return SkString("gammatext");
+    }
+
+    virtual SkISize onISize() {
+        return make_isize(1024, HEIGHT);
+    }
+
+    static void drawGrad(SkCanvas* canvas) {
+        SkPoint pts[] = { { 0, 0 }, { 0, HEIGHT } };
+#if 0
+        const SkColor colors[] = { SK_ColorBLACK, SK_ColorWHITE };
+        SkShader* s = SkGradientShader::CreateLinear(pts, colors, NULL, 2, SkShader::kClamp_TileMode);
+#else
+        SkShader* s = make_heatGradient(pts);
+#endif
+        
+        canvas->clear(SK_ColorRED);
+        SkPaint paint;
+        paint.setShader(s)->unref();
+        SkRect r = { 0, 0, 1024, HEIGHT };
+        canvas->drawRect(r, paint);
+    }
+
+    // Paints the gradient, then a column of sample text for each color in fg.
+    virtual void onDraw(SkCanvas* canvas) {
+#ifdef SK_BUILD_FOR_MAC
+        CGContextRef cg = makeCG(canvas->getDevice()->accessBitmap(false));
+#endif
+        drawGrad(canvas);
+        const SkColor fg[] = {
+            0xFFFFFFFF,
+            0xFFFFFF00, 0xFFFF00FF, 0xFF00FFFF,
+            0xFFFF0000, 0xFF00FF00, 0xFF0000FF,
+            0xFF000000,
+        };
+        const char* text = "Hamburgefons";
+        size_t len = strlen(text);
+
+        SkPaint paint;
+        setFont(&paint, "Times");
+        paint.setTextSize(SkIntToScalar(16));
+        paint.setAntiAlias(true);
+        paint.setLCDRenderText(true);
+
+        SkScalar x = 10;
+        for (size_t i = 0; i < SK_ARRAY_COUNT(fg); ++i) {
+            paint.setColor(fg[i]);
+            SkScalar y = 40;
+            SkScalar stopy = HEIGHT;
+            while (y < stopy) {
+#if 1
+                canvas->drawText(text, len, x, y, paint);
+#else
+                cgDrawText(cg, text, len, x, HEIGHT - y, paint);
+#endif
+                y += paint.getTextSize() * 2;
+            }
+            x += SkIntToScalar(1024) / SK_ARRAY_COUNT(fg);
+        }
+#ifdef SK_BUILD_FOR_MAC
+        CGContextRelease(cg);  // context from makeCG is owned here; NULL is ok
+#endif
+    }
+
+private:
+    typedef GM INHERITED;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+
+static GM* MyFactory(void*) { return new GammaTextGM; }
+static GMRegistry reg(MyFactory);
+
+}
diff --git a/gm/gmmain.cpp b/gm/gmmain.cpp
index 0af2933..523faf8 100644
--- a/gm/gmmain.cpp
+++ b/gm/gmmain.cpp
@@ -6,19 +6,21 @@
  */
 
 #include "gm.h"
+#include "system_preferences.h"
 #include "GrContext.h"
 #include "GrRenderTarget.h"
 
 #include "SkColorPriv.h"
 #include "SkData.h"
+#include "SkDeferredCanvas.h"
 #include "SkDevice.h"
 #include "SkGpuCanvas.h"
 #include "SkGpuDevice.h"
 #include "SkGraphics.h"
 #include "SkImageDecoder.h"
 #include "SkImageEncoder.h"
-#include "SkNativeGLContext.h"
-#include "SkMesaGLContext.h"
+#include "gl/SkNativeGLContext.h"
+#include "gl/SkMesaGLContext.h"
 #include "SkPicture.h"
 #include "SkStream.h"
 #include "SkRefCnt.h"
@@ -32,6 +34,9 @@
     #include "SkPDFDocument.h"
 #endif
 
+// Until we resolve http://code.google.com/p/skia/issues/detail?id=455 ,
+// stop writing out XPS-format image baselines in gm.
+#undef SK_SUPPORT_XPS
 #ifdef SK_SUPPORT_XPS
     #include "SkXPSDevice.h"
 #endif
@@ -268,26 +273,41 @@
 static ErrorBitfield generate_image(GM* gm, const ConfigData& gRec,
                                     GrContext* context,
                                     GrRenderTarget* rt,
-                                    SkBitmap* bitmap) {
+                                    SkBitmap* bitmap,
+                                    bool deferred) {
     SkISize size (gm->getISize());
     setup_bitmap(gRec, size, bitmap);
-    SkCanvas canvas(*bitmap);
 
     if (gRec.fBackend == kRaster_Backend) {
-        invokeGM(gm, &canvas);
+        SkCanvas* canvas;
+        if (deferred) {
+            canvas = new SkDeferredCanvas;
+            canvas->setDevice(new SkDevice(*bitmap))->unref();
+        } else {
+            canvas = new SkCanvas(*bitmap);
+        }
+        SkAutoUnref canvasUnref(canvas);
+        invokeGM(gm, canvas);
+        canvas->flush();
     } else {  // GPU
         if (NULL == context) {
             return ERROR_NO_GPU_CONTEXT;
         }
-        SkGpuCanvas gc(context, rt);
-        gc.setDevice(new SkGpuDevice(context, rt))->unref();
-        invokeGM(gm, &gc);
+        SkCanvas* gc;
+        if (deferred) {
+            gc = new SkDeferredCanvas;
+        } else {
+            gc = new SkGpuCanvas(context, rt);
+        }
+        SkAutoUnref gcUnref(gc);
+        gc->setDevice(new SkGpuDevice(context, rt))->unref();
+        invokeGM(gm, gc);
         // the device is as large as the current rendertarget, so we explicitly
         // only readback the amount we expect (in size)
         // overwrite our previous allocation
         bitmap->setConfig(SkBitmap::kARGB_8888_Config, size.fWidth,
                                                        size.fHeight);
-        gc.readPixels(bitmap, 0, 0);
+        gc->readPixels(bitmap, 0, 0);
     }
     return ERROR_NONE;
 }
@@ -488,7 +508,8 @@
     if (gRec.fBackend == kRaster_Backend ||
         gRec.fBackend == kGPU_Backend) {
         // Early exit if we can't generate the image.
-        ErrorBitfield errors = generate_image(gm, gRec, context, rt, bitmap);
+        ErrorBitfield errors = generate_image(gm, gRec, context, rt, bitmap,
+            false);
         if (ERROR_NONE != errors) {
             return errors;
         }
@@ -506,6 +527,28 @@
                                "", *bitmap, &document, NULL);
 }
 
+static ErrorBitfield test_deferred_drawing(GM* gm,
+                         const ConfigData& gRec,
+                         const SkBitmap& comparisonBitmap,
+                         const char diffPath [],
+                         GrContext* context,
+                         GrRenderTarget* rt) {
+    // Renders gm through the deferred path of generate_image and hands the
+    // result to handle_test_results along with comparisonBitmap.
+    if (gRec.fBackend == kRaster_Backend ||
+        gRec.fBackend == kGPU_Backend) {
+        SkBitmap bitmap;
+        // Early exit if we can't generate the image, but this is
+        // expected in some cases, so don't report a test failure.
+        if (ERROR_NONE != generate_image(gm, gRec, context, rt, &bitmap, true)) {
+            return ERROR_NONE;
+        }
+        return handle_test_results(gm, gRec, NULL, NULL, diffPath,
+                                   "-deferred", bitmap, NULL, &comparisonBitmap);
+    }
+    return ERROR_NONE;
+}
+
 static ErrorBitfield test_picture_playback(GM* gm,
                                            const ConfigData& gRec,
                                            const SkBitmap& comparisonBitmap,
@@ -545,20 +588,30 @@
 }
 
 static void usage(const char * argv0) {
-    SkDebugf("%s [-w writePath] [-r readPath] [-d diffPath]\n", argv0);
-    SkDebugf("    [--replay] [--serialize]\n");
+    SkDebugf(
+        "%s [-w writePath] [-r readPath] [-d diffPath] [--noreplay]\n"
+        "    [--serialize] [--forceBWtext] [--nopdf] [--nodeferred]\n"
+        "    [--match substring] [--notexturecache]"
+#if SK_MESA
+        " [--mesagl]"
+#endif
+        "\n\n", argv0);
     SkDebugf("    writePath: directory to write rendered images in.\n");
     SkDebugf(
 "    readPath: directory to read reference images from;\n"
 "        reports if any pixels mismatch between reference and new images\n");
     SkDebugf("    diffPath: directory to write difference images in.\n");
-    SkDebugf("    --replay: exercise SkPicture replay.\n");
+    SkDebugf("    --noreplay: do not exercise SkPicture replay.\n");
     SkDebugf(
 "    --serialize: exercise SkPicture serialization & deserialization.\n");
+    SkDebugf("    --forceBWtext: disable text anti-aliasing.\n");
+    SkDebugf("    --nopdf: skip the pdf rendering test pass.\n");
+    SkDebugf("    --nodeferred: skip the deferred rendering test pass.\n");
     SkDebugf("    --match foo will only run tests that substring match foo.\n");
 #if SK_MESA
     SkDebugf("    --mesagl will run using the osmesa sw gl rasterizer.\n");
 #endif
+    SkDebugf("    --notexturecache: disable the gpu texture cache.\n");
 }
 
 static const ConfigData gRec[] = {
@@ -602,17 +655,21 @@
     // we don't need to see this during a run
     gSkSuppressFontCachePurgeSpew = true;
 
+    setSystemPreferences();
+
     const char* writePath = NULL;   // if non-null, where we write the originals
     const char* readPath = NULL;    // if non-null, were we read from to compare
     const char* diffPath = NULL;    // if non-null, where we write our diffs (from compare)
 
     SkTDArray<const char*> fMatches;
-    
+
     bool doPDF = true;
     bool doReplay = true;
     bool doSerialize = false;
     bool useMesa = false;
-    
+    bool doDeferred = true;
+    bool disableTextureCache = false;
+
     const char* const commandName = argv[0];
     char* const* stop = argv + argc;
     for (++argv; argv < stop; ++argv) {
@@ -637,6 +694,8 @@
             doReplay = false;
         } else if (strcmp(*argv, "--nopdf") == 0) {
             doPDF = false;
+        } else if (strcmp(*argv, "--nodeferred") == 0) {
+            doDeferred = false;
         } else if (strcmp(*argv, "--serialize") == 0) {
             doSerialize = true;
         } else if (strcmp(*argv, "--match") == 0) {
@@ -649,6 +708,8 @@
         } else if (strcmp(*argv, "--mesagl") == 0) {
             useMesa = true;
 #endif
+        } else if (strcmp(*argv, "--notexturecache") == 0) {
+          disableTextureCache = true;
         } else {
           usage(commandName);
           return -1;
@@ -705,6 +766,10 @@
     int testsFailed = 0;
     int testsMissingReferenceImages = 0;
 
+    if (disableTextureCache) {
+        skiagm::GetGr()->setTextureCacheLimits(0, 0);
+    }
+
     iter.reset();
     while ((gm = iter.next()) != NULL) {
         const char* shortName = gm->shortName();
@@ -734,7 +799,7 @@
         for (size_t i = 0; i < SK_ARRAY_COUNT(gRec); i++) {
             // Skip any tests that we don't even need to try.
             uint32_t gmFlags = gm->getFlags();
-            if ((kPDF_Backend == gRec[i].fBackend) && 
+            if ((kPDF_Backend == gRec[i].fBackend) &&
                 (!doPDF || (gmFlags & GM::kSkipPDF_Flag)))
             {
                 continue;
@@ -758,6 +823,14 @@
                                            rt.get(), &forwardRenderedBitmap);
             }
 
+            if (doDeferred && !testErrors &&
+                (kGPU_Backend == gRec[i].fBackend ||
+                kRaster_Backend == gRec[i].fBackend)) {
+                testErrors |= test_deferred_drawing(gm, gRec[i],
+                                    forwardRenderedBitmap,
+                                    diffPath, gGrContext, rt.get());
+            }
+
             if ((ERROR_NONE == testErrors) && doReplay &&
                 !(gmFlags & GM::kSkipPicture_Flag)) {
                 testErrors |= test_picture_playback(gm, gRec[i],
diff --git a/gm/gradients.cpp b/gm/gradients.cpp
index aac8a96..3eb5633 100644
--- a/gm/gradients.cpp
+++ b/gm/gradients.cpp
@@ -213,6 +213,48 @@
     typedef GM INHERITED;
 };
 
+/// Checks quality of large radial gradients, which may display
+/// some banding.
+
+class RadialGradientGM : public GM {
+public:
+    RadialGradientGM() {}
+
+protected:
+    SkString onShortName() { return SkString("radial_gradient"); }
+    virtual SkISize onISize() { return make_isize(1280, 1280); }
+    // Fills the canvas with opaque black (0xFF000000).
+    void drawBG(SkCanvas* canvas) { canvas->drawColor(0xFF000000); }
+    // Fills the GM with a dithered, three-stop radial gradient centered in
+    // the image; the gradient radius is half of the image width.
+    virtual void onDraw(SkCanvas* canvas) {
+        const SkISize size = this->getISize();
+        this->drawBG(canvas);
+
+        const SkScalar width = SkIntToScalar(size.width());
+        const SkScalar height = SkIntToScalar(size.height());
+        const SkScalar radius = width/2;
+        SkPoint center;
+        center.set(width/2, height/2);
+
+        const SkColor colors[] = { 0x7f7f7f7f, 0x7f7f7f7f, 0xb2000000 };
+        const SkScalar pos[] = { SkFloatToScalar(0.0),
+                                 SkFloatToScalar(0.35),
+                                 SkFloatToScalar(1.0) };
+        SkPaint paint;
+        paint.setDither(true);
+        SkShader* gradient = SkGradientShader::CreateRadial(
+                center, radius, colors, pos, SK_ARRAY_COUNT(pos),
+                SkShader::kClamp_TileMode);
+        paint.setShader(gradient)->unref();
+
+        canvas->drawRect(SkRect::MakeXYWH(0, 0, width, height), paint);
+    }
+private:
+    typedef GM INHERITED;
+};
+
+
 
 ///////////////////////////////////////////////////////////////////////////////
 
@@ -225,5 +267,7 @@
 static GM* MyFactory3(void*) { return new ClampedGradientsGM; }
 static GMRegistry reg3(MyFactory3);
 
+static GM* MyFactory4(void*) { return new RadialGradientGM; }
+static GMRegistry reg4(MyFactory4);
 }
 
diff --git a/gm/morphology.cpp b/gm/morphology.cpp
new file mode 100644
index 0000000..bfaa406
--- /dev/null
+++ b/gm/morphology.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "gm.h"
+#include "SkMorphologyImageFilter.h"
+
+#define WIDTH 640
+#define HEIGHT 480
+
+namespace skiagm {
+
+// GM that draws one text bitmap through dilate and erode image filters with
+// several x/y radii: dilate samples at y = 0, erode samples at y = 140.
+class MorphologyGM : public GM {
+public:
+    MorphologyGM() : fOnce(false) {
+        this->setBGColor(0xFF000000);
+    }
+
+protected:
+    virtual SkString onShortName() {
+        return SkString("morphology");
+    }
+
+    // Renders the white "ABC" / "XYZ" source text into the 135x135 fBitmap.
+    void make_bitmap() {
+        fBitmap.setConfig(SkBitmap::kARGB_8888_Config, 135, 135);
+        fBitmap.allocPixels();
+        SkDevice device(fBitmap);
+        SkCanvas canvas(&device);
+        canvas.clear(0x0);
+        SkPaint paint;
+        paint.setAntiAlias(true);
+        const char* str1 = "ABC";
+        const char* str2 = "XYZ";
+        paint.setColor(0xFFFFFFFF);
+        paint.setTextSize(64);
+        canvas.drawText(str1, strlen(str1), 10, 55, paint);
+        canvas.drawText(str2, strlen(str2), 10, 110, paint);
+    }
+
+    virtual SkISize onISize() { return make_isize(WIDTH, HEIGHT); }
+    virtual void onDraw(SkCanvas* canvas) {
+        // The source bitmap is built once, on the first draw.
+        if (!fOnce) {
+            make_bitmap();
+            fOnce = true;
+        }
+        struct {
+            int fRadiusX, fRadiusY;
+            bool erode;
+            SkScalar fX, fY;
+        } samples[] = {
+            { 0, 0, false, 0,   0 },
+            { 0, 2, false, 140, 0 },
+            { 2, 0, false, 280, 0 },
+            { 2, 2, false, 420, 0 },
+            { 0, 0, true,  0,   140 },
+            { 0, 2, true,  140, 140 },
+            { 2, 0, true,  280, 140 },
+            { 2, 2, true,  420, 140 },
+        };
+        SkPaint paint;
+        for (unsigned i = 0; i < SK_ARRAY_COUNT(samples); ++i) {
+            if (samples[i].erode) {
+                paint.setImageFilter(new SkErodeImageFilter(
+                    samples[i].fRadiusX,
+                    samples[i].fRadiusY))->unref();
+            } else {
+                paint.setImageFilter(new SkDilateImageFilter(
+                    samples[i].fRadiusX,
+                    samples[i].fRadiusY))->unref();
+            }
+            SkRect bounds = SkRect::MakeXYWH(samples[i].fX,
+                                             samples[i].fY,
+                                             140, 140);
+            canvas->saveLayer(&bounds, &paint);
+            canvas->drawBitmap(fBitmap, samples[i].fX, samples[i].fY);
+            canvas->restore();
+        }
+    }
+
+private:
+    typedef GM INHERITED;
+    SkBitmap fBitmap;
+    bool fOnce;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+
+static GM* MyFactory(void*) { return new MorphologyGM; }
+static GMRegistry reg(MyFactory);
+
+}
diff --git a/gm/patheffects.cpp b/gm/patheffects.cpp
new file mode 100644
index 0000000..c606116
--- /dev/null
+++ b/gm/patheffects.cpp
@@ -0,0 +1,160 @@
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+#include "gm.h"
+#include "SkCanvas.h"
+#include "SkPaint.h"
+#include "Sk1DPathEffect.h"
+#include "Sk2DPathEffect.h"
+#include "SkCornerPathEffect.h"
+#include "SkDashPathEffect.h"
+#include "SkDiscretePathEffect.h"
+
+namespace skiagm {
+
+// Installs an SkCornerPathEffect(25) on the paint. If the paint already has
+// a path effect, the two are combined with SkComposePathEffect instead.
+static void compose_pe(SkPaint* paint) {
+    SkPathEffect* existing = paint->getPathEffect();
+    SkPathEffect* effect = new SkCornerPathEffect(25);
+    if (existing) {
+        SkPathEffect* corner = effect;
+        effect = new SkComposePathEffect(existing, corner);
+        corner->unref();
+    }
+    paint->setPathEffect(effect)->unref();
+}
+
+static void hair_pe(SkPaint* paint) {
+    paint->setStrokeWidth(0);
+}
+
+static void hair2_pe(SkPaint* paint) {
+    paint->setStrokeWidth(0);
+    compose_pe(paint);
+}
+
+static void stroke_pe(SkPaint* paint) {
+    paint->setStrokeWidth(12);
+    compose_pe(paint);
+}
+
+static void dash_pe(SkPaint* paint) {
+    SkScalar inter[] = { 20, 10, 10, 10 };
+    paint->setStrokeWidth(12);
+    paint->setPathEffect(new SkDashPathEffect(inter, SK_ARRAY_COUNT(inter),
+                                              0))->unref();
+    compose_pe(paint);
+}
+
+static const int gXY[] = {
+4, 0, 0, -4, 8, -4, 12, 0, 8, 4, 0, 4
+};
+
+static void scale(SkPath* path, SkScalar scale) {
+    SkMatrix m;
+    m.setScale(scale, scale);
+    path->transform(m);
+}
+
+// Uses a closed polygon built from gXY as a kRotate_Style 1D path effect.
+static void one_d_pe(SkPaint* paint) {
+    SkPath shape;
+    shape.moveTo(SkIntToScalar(gXY[0]), SkIntToScalar(gXY[1]));
+    for (unsigned i = 2; i < SK_ARRAY_COUNT(gXY); i += 2)
+        shape.lineTo(SkIntToScalar(gXY[i]), SkIntToScalar(gXY[i+1]));
+    shape.close();
+    shape.offset(SkIntToScalar(-6), 0);
+    scale(&shape, 1.5);
+    paint->setPathEffect(new SkPath1DPathEffect(shape, SkIntToScalar(21), 0,
+                                SkPath1DPathEffect::kRotate_Style))->unref();
+    // Finish by layering the corner effect on top of the 1D effect.
+    compose_pe(paint);
+}
+
+typedef void (*PE_Proc)(SkPaint*);
+static const PE_Proc gPE[] = { hair_pe, hair2_pe, stroke_pe, dash_pe, one_d_pe };
+
+static void fill_pe(SkPaint* paint) {
+    paint->setStyle(SkPaint::kFill_Style);
+    paint->setPathEffect(NULL);
+}
+
+static void discrete_pe(SkPaint* paint) {
+    paint->setPathEffect(new SkDiscretePathEffect(10, 4))->unref();
+}
+
+static SkPathEffect* MakeTileEffect() {
+    SkMatrix m;
+    m.setScale(SkIntToScalar(12), SkIntToScalar(12));
+
+    SkPath path;
+    path.addCircle(0, 0, SkIntToScalar(5));
+    
+    return new SkPath2DPathEffect(m, path);
+}
+
+static void tile_pe(SkPaint* paint) {
+    paint->setPathEffect(MakeTileEffect())->unref();
+}
+
+static const PE_Proc gPE2[] = { fill_pe, discrete_pe, tile_pe };
+
+class PathEffectGM : public GM {
+public:
+    PathEffectGM() {}
+
+protected:
+    SkString onShortName() {
+        return SkString("patheffect");
+    }
+
+    SkISize onISize() { return make_isize(800, 600); }
+    // Draws one path per gPE proc, then a second path per gPE2 proc.
+    virtual void onDraw(SkCanvas* canvas) {
+        SkPaint paint;
+        paint.setAntiAlias(true);
+        paint.setStyle(SkPaint::kStroke_Style);
+        // Open polyline through five points.
+        SkPath path;
+        path.moveTo(20, 20);
+        path.lineTo(70, 120);
+        path.lineTo(120, 30);
+        path.lineTo(170, 80);
+        path.lineTo(240, 50);
+        // The paint is shared, so each proc's changes carry into the next draw.
+        size_t i;
+        canvas->save();
+        for (i = 0; i < SK_ARRAY_COUNT(gPE); i++) {
+            gPE[i](&paint);
+            canvas->drawPath(path, paint);
+            canvas->translate(0, 75);
+        }
+        canvas->restore();
+        // Oval with an inset rect added in the opposite direction.
+        path.reset();
+        SkRect r = { 0, 0, 250, 120 };
+        path.addOval(r, SkPath::kCW_Direction);
+        r.inset(50, 50);
+        path.addRect(r, SkPath::kCCW_Direction);
+
+        canvas->translate(320, 20);
+        for (i = 0; i < SK_ARRAY_COUNT(gPE2); i++) {
+            gPE2[i](&paint);
+            canvas->drawPath(path, paint);
+            canvas->translate(0, 160);
+        }
+    }
+
+private:
+    typedef GM INHERITED;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+
+static GM* PathEffectFactory(void*) { return new PathEffectGM; }
+static GMRegistry regPathEffect(PathEffectFactory);
+
+}
diff --git a/gm/strokefill.cpp b/gm/strokefill.cpp
index a37af80..75fa008 100644
--- a/gm/strokefill.cpp
+++ b/gm/strokefill.cpp
@@ -27,24 +27,35 @@
         return make_isize(640, 480);
     }
 
+    // Draws text at (x, y), then again with fake bold text at (x, y + 120).
+    static void show_bold(SkCanvas* canvas, const char text[], SkScalar x,
+                          SkScalar y, const SkPaint& paint) {
+        canvas->drawText(text, strlen(text), x, y, paint);
+        SkPaint bold(paint);
+        bold.setFakeBoldText(true);
+        canvas->drawText(text, strlen(text), x, y + SkIntToScalar(120), bold);
+    }
+
     virtual void onDraw(SkCanvas* canvas) {
+        SkScalar x = SkIntToScalar(100);
+        SkScalar y = SkIntToScalar(88);
+
         SkPaint paint;
-        const char text[] = "Hello"; // "Hello";
-        const size_t len = sizeof(text) - 1;
         paint.setAntiAlias(true);
         paint.setTextSize(SkIntToScalar(100));
-//        SkTypeface* hira = SkTypeface::CreateFromName("Hiragino Maru Gothic Pro", SkTypeface::kNormal);
-        SkTypeface* hira = SkTypeface::CreateFromName("Papyrus", SkTypeface::kNormal);
-        paint.setTypeface(hira);
-        SkScalar x = SkIntToScalar(180);
-        SkScalar y = SkIntToScalar(88);
-        
-        canvas->drawText(text, len, x, y, paint);
-        paint.setFakeBoldText(true);
-        canvas->drawText(text, len, x, y + SkIntToScalar(100), paint);
-        paint.setStyle(SkPaint::kStrokeAndFill_Style);
         paint.setStrokeWidth(SkIntToScalar(5));
         
+        SkTypeface* face = SkTypeface::CreateFromName("Papyrus", SkTypeface::kNormal);
+        SkSafeUnref(paint.setTypeface(face));
+        show_bold(canvas, "Hello", x, y, paint);
+
+        face = SkTypeface::CreateFromName("Hiragino Maru Gothic Pro", SkTypeface::kNormal);
+        SkSafeUnref(paint.setTypeface(face));
+        const char hyphen[] = { 0xE3, 0x83, 0xBC, 0 };
+        show_bold(canvas, hyphen, x + SkIntToScalar(300), y, paint);
+
+        paint.setStyle(SkPaint::kStrokeAndFill_Style);
+
         SkPath path;
         path.setFillType(SkPath::kWinding_FillType);
         path.addCircle(x, y + SkIntToScalar(200), SkIntToScalar(50), SkPath::kCW_Direction);
diff --git a/gm/system_preferences.h b/gm/system_preferences.h
new file mode 100644
index 0000000..0fcf489
--- /dev/null
+++ b/gm/system_preferences.h
@@ -0,0 +1,12 @@
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+// Set up system preferences with a known state.
+// This is implemented on a per-platform basis.
+// TODO(epoger): move this out of gm/ into a more common directory, so
+// that it can be used more broadly.
+void setSystemPreferences();
diff --git a/gm/system_preferences_default.cpp b/gm/system_preferences_default.cpp
new file mode 100644
index 0000000..1fc4e6c
--- /dev/null
+++ b/gm/system_preferences_default.cpp
@@ -0,0 +1,10 @@
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+// This is a default implementation of setSystemPreferences() that does nothing.
+void setSystemPreferences() {
+}
diff --git a/gm/system_preferences_mac.mm b/gm/system_preferences_mac.mm
new file mode 100644
index 0000000..2c772a1
--- /dev/null
+++ b/gm/system_preferences_mac.mm
@@ -0,0 +1,24 @@
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#import <Cocoa/Cocoa.h>
+
+void setSystemPreferences() {
+    NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];
+
+    // Set LCD font smoothing level for this application (does not affect other
+    // applications). Based on resetDefaultsToConsistentValues() in
+    // http://trac.webkit.org/browser/trunk/Tools/DumpRenderTree/mac/DumpRenderTree.mm
+    static const int NoFontSmoothing     = 0;
+    static const int LightFontSmoothing  = 1;
+    static const int MediumFontSmoothing = 2;
+    static const int StrongFontSmoothing = 3;
+    NSUserDefaults *defaults = [NSUserDefaults standardUserDefaults];
+    [defaults setInteger:MediumFontSmoothing forKey:@"AppleFontSmoothing"];
+
+    [pool release];
+}
diff --git a/gm/texdata.cpp b/gm/texdata.cpp
index c68a16a..1fdae69 100644
--- a/gm/texdata.cpp
+++ b/gm/texdata.cpp
@@ -75,13 +75,13 @@
                 }
 
                 GrTextureDesc desc;
-                desc.fAALevel   = kNone_GrAALevel;
                 // use RT flag bit because in GL it makes the texture be bottom-up
                 desc.fFlags     = i ? kRenderTarget_GrTextureFlagBit :
                                       kNone_GrTextureFlags;
                 desc.fConfig    = kSkia8888_PM_GrPixelConfig;
                 desc.fWidth     = 2 * S;
                 desc.fHeight    = 2 * S;
+                desc.fSampleCnt = 0;
                 GrTexture* texture = 
                     ctx->createUncachedTexture(desc, gTextureData, 0);
 
diff --git a/gyp/SampleApp.gyp b/gyp/SampleApp.gyp
index fed4bf4..1f97dc6 100644
--- a/gyp/SampleApp.gyp
+++ b/gyp/SampleApp.gyp
@@ -153,7 +153,6 @@
         'utils.gyp:utils',
         'animator.gyp:animator',
         'xml.gyp:xml',
-        'svg.gyp:svg',
         'experimental.gyp:experimental',
         'gpu.gyp:gr',
         'gpu.gyp:skgr',
diff --git a/gyp/animator.gyp b/gyp/animator.gyp
index fc61e71..f713f3f 100644
--- a/gyp/animator.gyp
+++ b/gyp/animator.gyp
@@ -15,6 +15,7 @@
         '../include/xml',
         '../include/utils',
         '../include/images',
+        '../src/utils',
       ],
       'sources': [
         '../include/animator/SkAnimator.h',
@@ -36,8 +37,6 @@
         '../src/animator/SkAnimatorScript.h',
         #'../src/animator/SkAnimatorScript2.cpp', fails on windows
         #'../src/animator/SkAnimatorScript2.h',
-        '../src/animator/SkBase64.cpp',
-        '../src/animator/SkBase64.h',
         '../src/animator/SkBoundable.cpp',
         '../src/animator/SkBoundable.h',
         '../src/animator/SkBuildCondensedInfo.cpp',
diff --git a/gyp/bench.gypi b/gyp/bench.gypi
index 5fbfef7..2ea3660 100644
--- a/gyp/bench.gypi
+++ b/gyp/bench.gypi
@@ -28,6 +28,7 @@
     '../bench/MatrixBench.cpp',
     '../bench/MutexBench.cpp',
     '../bench/PathBench.cpp',
+    '../bench/PicturePlaybackBench.cpp',
     '../bench/RectBench.cpp',
     '../bench/RepeatTileBench.cpp',
     '../bench/ScalarBench.cpp',
diff --git a/gyp/common_conditions.gypi b/gyp/common_conditions.gypi
index e18cbc1..7152181 100644
--- a/gyp/common_conditions.gypi
+++ b/gyp/common_conditions.gypi
@@ -78,13 +78,16 @@
         'defines': [
           'SK_SAMPLES_FOR_X',
           'SK_BUILD_FOR_UNIX',
+          'SK_USE_COLOR_LUMINANCE',
+          'SK_GAMMA_APPLY_TO_A8',
         ],
         'configurations': {
           'Debug': {
             'cflags': ['-g']
           },
           'Release': {
-            'cflags': ['-O2']
+            'cflags': ['-O2'],
+            'defines': [ 'NDEBUG' ],
           },
         },
         'cflags': [
@@ -119,6 +122,7 @@
             'xcode_settings': {
               'GCC_OPTIMIZATION_LEVEL': '3',
             },
+            'defines': [ 'NDEBUG' ],
           },
         },
         'xcode_settings': {
@@ -161,7 +165,8 @@
             'cflags': ['-g']
           },
           'Release': {
-            'cflags': ['-O2']
+            'cflags': ['-O2'],
+            'defines': [ 'NDEBUG' ],
           },
         },
         'libraries': [
@@ -174,6 +179,12 @@
           '-fno-rtti',
         ],
         'conditions': [
+          [ 'skia_target_arch == "arm"', {
+            'ldflags': [
+              # Linker options are passed as a single '-Wl,<option>' argument.
+              '-Wl,--fix-cortex-a8',
+            ],
+          }],
           [ 'skia_target_arch == "arm" and arm_thumb == 1', {
             'cflags': [
               '-mthumb',
@@ -185,6 +196,7 @@
             ],
             'cflags': [
               '-march=armv7-a',
+              '-mfloat-abi=softfp',
             ],
             'conditions': [
               [ 'arm_neon == 1', {
@@ -192,7 +204,6 @@
                   '__ARM_HAVE_NEON',
                 ],
                 'cflags': [
-                  '-mfloat-abi=softfp',
                   '-mfpu=neon',
                 ],
              }],
@@ -202,6 +213,14 @@
       },
     ],
 
+    # We can use POD-style initialization of static mutexes to avoid generating
+    # static initializers if we're using a pthread-compatible thread interface.
+    [ 'skia_os != "win"', {
+      'defines': [
+        'SK_USE_POSIX_THREADS'
+      ],
+    }],
+
   ], # end 'conditions'
 }
 
diff --git a/gyp/core.gyp b/gyp/core.gyp
index f83f482..077d395 100644
--- a/gyp/core.gyp
+++ b/gyp/core.gyp
@@ -52,6 +52,7 @@
         '../src/core/SkComposeShader.cpp',
         '../src/core/SkConcaveToTriangles.cpp',
         '../src/core/SkConcaveToTriangles.h',
+        '../src/core/SkConfig8888.cpp',
         '../src/core/SkConfig8888.h',
         '../src/core/SkCordic.cpp',
         '../src/core/SkCordic.h',
@@ -62,6 +63,7 @@
         '../src/core/SkDebug.cpp',
         '../src/core/SkDeque.cpp',
         '../src/core/SkDevice.cpp',
+        '../src/core/SkDeviceProfile.cpp',
         '../src/core/SkDither.cpp',
         '../src/core/SkDraw.cpp',
         '../src/core/SkDrawProcs.h',
diff --git a/gyp/effects.gyp b/gyp/effects.gyp
index 28f0017..ac56510 100644
--- a/gyp/effects.gyp
+++ b/gyp/effects.gyp
@@ -32,6 +32,7 @@
         '../include/effects/SkKernel33MaskFilter.h',
         '../include/effects/SkLayerDrawLooper.h',
         '../include/effects/SkLayerRasterizer.h',
+        '../include/effects/SkMorphologyImageFilter.h',
         '../include/effects/SkPaintFlagsDrawFilter.h',
         '../include/effects/SkPixelXorXfermode.h',
         '../include/effects/SkPorterDuff.h',
@@ -66,6 +67,7 @@
         '../src/effects/SkKernel33MaskFilter.cpp',
         '../src/effects/SkLayerDrawLooper.cpp',
         '../src/effects/SkLayerRasterizer.cpp',
+        '../src/effects/SkMorphologyImageFilter.cpp',
         '../src/effects/SkPaintFlagsDrawFilter.cpp',
         '../src/effects/SkPixelXorXfermode.cpp',
         '../src/effects/SkPorterDuff.cpp',
diff --git a/gyp/gm.gyp b/gyp/gm.gyp
index 43075dc..3c5e12e 100644
--- a/gyp/gm.gyp
+++ b/gyp/gm.gyp
@@ -14,6 +14,7 @@
       'sources': [
         '../gm/gm.cpp',
         '../gm/gmmain.cpp',
+        '../gm/system_preferences_default.cpp',
       ],
       'dependencies': [
         'core.gyp:core',
@@ -25,9 +26,22 @@
         'pdf.gyp:pdf',
         'utils.gyp:utils',        
       ],
-      #mac does not like empty dependency.
       'conditions': [
-        [ 'skia_os == "win"', {
+        ['skia_os == "mac"', {
+          'sources!': [
+            '../gm/system_preferences_default.cpp',
+          ],
+          'sources': [
+            '../gm/system_preferences_mac.mm',
+          ],
+          'link_settings': {
+            'libraries': [
+              '$(SDKROOT)/System/Library/Frameworks/Cocoa.framework',
+              '$(SDKROOT)/System/Library/Frameworks/Foundation.framework',
+            ],
+          },
+        }],
+        ['skia_os == "win"', {
           'dependencies': [
             'xps.gyp:xps',
           ],
diff --git a/gyp/gmslides.gypi b/gyp/gmslides.gypi
index 66692d1..2c96e45 100644
--- a/gyp/gmslides.gypi
+++ b/gyp/gmslides.gypi
@@ -11,6 +11,7 @@
     '../gm/colormatrix.cpp',
     '../gm/complexclip.cpp',
     '../gm/complexclip2.cpp',
+    '../gm/convexpaths.cpp',
     '../gm/cubicpaths.cpp',
     '../gm/degeneratesegments.cpp',
     '../gm/drawbitmaprect.cpp',
@@ -18,14 +19,17 @@
     '../gm/filltypes.cpp',
     '../gm/filltypespersp.cpp',
     '../gm/fontscaler.cpp',
+    '../gm/gammatext.cpp',
     '../gm/gradients.cpp',
     '../gm/gradtext.cpp',
     '../gm/hairmodes.cpp',
     '../gm/imageblur.cpp',
     '../gm/lcdtext.cpp',
     '../gm/linepaths.cpp',
+    '../gm/morphology.cpp',
     '../gm/ninepatchstretch.cpp',
     '../gm/nocolorbleed.cpp',
+    '../gm/patheffects.cpp',
     '../gm/pathfill.cpp',
     '../gm/pathreverse.cpp',
     '../gm/points.cpp',
diff --git a/gyp/gpu.gyp b/gyp/gpu.gyp
index 9fcded1..bb13bbd 100644
--- a/gyp/gpu.gyp
+++ b/gyp/gpu.gyp
@@ -96,23 +96,25 @@
         '../include/gpu',
       ],
       'sources': [
-        '../include/gpu/SkGLContext.h',
-        '../include/gpu/SkMesaGLContext.h',
-        '../include/gpu/SkNativeGLContext.h',
-        '../include/gpu/SkNullGLContext.h',
         '../include/gpu/SkGpuCanvas.h',
         '../include/gpu/SkGpuDevice.h',
         '../include/gpu/SkGr.h',
         '../include/gpu/SkGrTexturePixelRef.h',
 
+        '../include/gpu/gl/SkGLContext.h',
+        '../include/gpu/gl/SkMesaGLContext.h',
+        '../include/gpu/gl/SkNativeGLContext.h',
+        '../include/gpu/gl/SkNullGLContext.h',
+
         '../src/gpu/GrPrintf_skia.cpp',
-        '../src/gpu/SkGLContext.cpp',
         '../src/gpu/SkGpuCanvas.cpp',
         '../src/gpu/SkGpuDevice.cpp',
         '../src/gpu/SkGr.cpp',
         '../src/gpu/SkGrFontScaler.cpp',
         '../src/gpu/SkGrTexturePixelRef.cpp',
-        '../src/gpu/SkNullGLContext.cpp',
+
+        '../src/gpu/gl/SkGLContext.cpp',
+        '../src/gpu/gl/SkNullGLContext.cpp',
 
         '../src/gpu/android/SkNativeGLContext_android.cpp',
 
@@ -144,6 +146,7 @@
         '../include/core',
         '../include/config',
         '../include/gpu',
+        '../src/core', # SkRasterClip.h
       ],
       'dependencies': [
         'libtess.gyp:libtess',
@@ -155,10 +158,6 @@
         '../include/gpu/GrConfig.h',
         '../include/gpu/GrContext.h',
         '../include/gpu/GrFontScaler.h',
-        '../include/gpu/GrGLConfig.h',
-        '../include/gpu/GrGLConfig_chrome.h',
-        '../include/gpu/GrGLDefines.h',
-        '../include/gpu/GrGLInterface.h',
         '../include/gpu/GrGlyph.h',
         '../include/gpu/GrInstanceCounter.h',
         '../include/gpu/GrKey.h',
@@ -178,9 +177,16 @@
         '../include/gpu/GrTypes.h',
         '../include/gpu/GrUserConfig.h',
 
+        '../include/gpu/gl/GrGLConfig.h',
+        '../include/gpu/gl/GrGLConfig_chrome.h',
+        '../include/gpu/gl/GrGLDefines.h',
+        '../include/gpu/gl/GrGLInterface.h',
+
         '../src/gpu/GrAAHairLinePathRenderer.cpp',
         '../src/gpu/GrAAHairLinePathRenderer.h',
-        '../src/gpu/GrAddPathRenderers_aahairline.cpp',
+        '../src/gpu/GrAAConvexPathRenderer.cpp',
+        '../src/gpu/GrAAConvexPathRenderer.h',
+        '../src/gpu/GrAddPathRenderers_default.cpp',
         '../src/gpu/GrAllocator.h',
         '../src/gpu/GrAllocPool.h',
         '../src/gpu/GrAllocPool.cpp',
@@ -197,35 +203,9 @@
         '../src/gpu/GrDrawTarget.cpp',
         '../src/gpu/GrDrawTarget.h',
         '../src/gpu/GrGeometryBuffer.h',
-        '../src/gpu/GrGLCreateNativeInterface_none.cpp',
-        '../src/gpu/GrGLCreateNullInterface.cpp',
-        '../src/gpu/GrGLDefaultInterface_none.cpp',
-        '../src/gpu/GrGLDefaultInterface_native.cpp',
-        '../src/gpu/GrGLIndexBuffer.cpp',
-        '../src/gpu/GrGLIndexBuffer.h',
-        '../src/gpu/GrGLInterface.cpp',
-        '../src/gpu/GrGLIRect.h',
-        '../src/gpu/GrGLProgram.cpp',
-        '../src/gpu/GrGLProgram.h',
-        '../src/gpu/GrGLRenderTarget.cpp',
-        '../src/gpu/GrGLRenderTarget.h',
-        '../src/gpu/GrGLShaderVar.h',
-        '../src/gpu/GrGLSL.cpp',
-        '../src/gpu/GrGLSL.h',
-        '../src/gpu/GrGLStencilBuffer.cpp',
-        '../src/gpu/GrGLStencilBuffer.h',
-        '../src/gpu/GrGLTexture.cpp',
-        '../src/gpu/GrGLTexture.h',
-        '../src/gpu/GrGLUtil.cpp',
-        '../src/gpu/GrGLVertexBuffer.cpp',
-        '../src/gpu/GrGLVertexBuffer.h',
         '../src/gpu/GrGpu.cpp',
         '../src/gpu/GrGpu.h',
         '../src/gpu/GrGpuFactory.cpp',
-        '../src/gpu/GrGpuGL.cpp',
-        '../src/gpu/GrGpuGL.h',
-        '../src/gpu/GrGpuGLShaders.cpp',
-        '../src/gpu/GrGpuGLShaders.h',
         '../src/gpu/GrGpuVertex.h',
         '../src/gpu/GrIndexBuffer.h',
         '../src/gpu/GrInOrderDrawBuffer.cpp',
@@ -266,6 +246,36 @@
         '../src/gpu/GrVertexBuffer.h',
         '../src/gpu/gr_unittests.cpp',
 
+        '../src/gpu/gl/GrGLCaps.cpp',
+        '../src/gpu/gl/GrGLCaps.h',
+        '../src/gpu/gl/GrGLContextInfo.cpp',
+        '../src/gpu/gl/GrGLContextInfo.h',
+        '../src/gpu/gl/GrGLCreateNativeInterface_none.cpp',
+        '../src/gpu/gl/GrGLCreateNullInterface.cpp',
+        '../src/gpu/gl/GrGLDefaultInterface_none.cpp',
+        '../src/gpu/gl/GrGLDefaultInterface_native.cpp',
+        '../src/gpu/gl/GrGLIndexBuffer.cpp',
+        '../src/gpu/gl/GrGLIndexBuffer.h',
+        '../src/gpu/gl/GrGLInterface.cpp',
+        '../src/gpu/gl/GrGLIRect.h',
+        '../src/gpu/gl/GrGLProgram.cpp',
+        '../src/gpu/gl/GrGLProgram.h',
+        '../src/gpu/gl/GrGLRenderTarget.cpp',
+        '../src/gpu/gl/GrGLRenderTarget.h',
+        '../src/gpu/gl/GrGLShaderVar.h',
+        '../src/gpu/gl/GrGLSL.cpp',
+        '../src/gpu/gl/GrGLSL.h',
+        '../src/gpu/gl/GrGLStencilBuffer.cpp',
+        '../src/gpu/gl/GrGLStencilBuffer.h',
+        '../src/gpu/gl/GrGLTexture.cpp',
+        '../src/gpu/gl/GrGLTexture.h',
+        '../src/gpu/gl/GrGLUtil.cpp',
+        '../src/gpu/gl/GrGLVertexBuffer.cpp',
+        '../src/gpu/gl/GrGLVertexBuffer.h',
+        '../src/gpu/gl/GrGpuGL.cpp',
+        '../src/gpu/gl/GrGpuGL.h',
+        '../src/gpu/gl/GrGpuGLShaders.cpp',
+        '../src/gpu/gl/GrGpuGLShaders.h',
 
         '../src/gpu/mac/GrGLCreateNativeInterface_mac.cpp',
 
@@ -283,8 +293,8 @@
       'conditions': [
         [ 'skia_os == "linux"', {
           'sources!': [
-            '../src/gpu/GrGLDefaultInterface_none.cpp',
-            '../src/gpu/GrGLCreateNativeInterface_none.cpp',
+            '../src/gpu/gl/GrGLDefaultInterface_none.cpp',
+            '../src/gpu/gl/GrGLCreateNativeInterface_none.cpp',
           ],
           'link_settings': {
             'libraries': [
@@ -307,8 +317,8 @@
             ],
           },
           'sources!': [
-            '../src/gpu/GrGLDefaultInterface_none.cpp',
-            '../src/gpu/GrGLCreateNativeInterface_none.cpp',
+            '../src/gpu/gl/GrGLDefaultInterface_none.cpp',
+            '../src/gpu/gl/GrGLCreateNativeInterface_none.cpp',
           ],
         }],
         [ 'skia_mesa and skia_os == "mac"', {
@@ -328,14 +338,14 @@
         }],
         [ 'skia_os == "win"', {
           'sources!': [
-            '../src/gpu/GrGLDefaultInterface_none.cpp',
-            '../src/gpu/GrGLCreateNativeInterface_none.cpp',
+            '../src/gpu/gl/GrGLDefaultInterface_none.cpp',
+            '../src/gpu/gl/GrGLCreateNativeInterface_none.cpp',
           ],
         }],
         [ 'skia_os == "android"', {
           'sources!': [
-            '../src/gpu/GrGLDefaultInterface_none.cpp',
-            '../src/gpu/GrGLCreateNativeInterface_none.cpp',
+            '../src/gpu/gl/GrGLDefaultInterface_none.cpp',
+            '../src/gpu/gl/GrGLCreateNativeInterface_none.cpp',
           ],
           'link_settings': {
             'libraries': [
diff --git a/gyp/iOSSampleApp.gyp b/gyp/iOSSampleApp.gyp
index 84708bb..b864ca6 100644
--- a/gyp/iOSSampleApp.gyp
+++ b/gyp/iOSSampleApp.gyp
@@ -154,7 +154,6 @@
         'utils.gyp:utils',
         'animator.gyp:animator',
         'xml.gyp:xml',
-        'svg.gyp:svg',
         'experimental.gyp:experimental',
         'gpu.gyp:gr',
         'gpu.gyp:skgr',
diff --git a/gyp/opts.gyp b/gyp/opts.gyp
index 1f67e00..11391a8 100644
--- a/gyp/opts.gyp
+++ b/gyp/opts.gyp
@@ -31,18 +31,23 @@
         '../src/opts',
       ],
       'conditions': [
-        [ 'skia_os in ["linux", "freebsd", "openbsd", "solaris"]', {
-          'cflags': [
-            '-msse2',
-          ],
-        }],
         [ 'skia_target_arch != "arm"', {
+          'conditions': [
+            [ 'skia_os in ["linux", "freebsd", "openbsd", "solaris"]', {
+              'cflags': [
+                '-msse2',
+              ],
+            }],
+          ],
           'sources': [
             '../src/opts/opts_check_SSE2.cpp',
             '../src/opts/SkBitmapProcState_opts_SSE2.cpp',
             '../src/opts/SkBlitRow_opts_SSE2.cpp',
             '../src/opts/SkUtils_opts_SSE2.cpp',
           ],
+          'dependencies': [
+            'opts_ssse3',
+          ],
         }],
         [ 'skia_target_arch == "arm" and armv7 == 1', {
           # The assembly uses the frame pointer register (r7 in Thumb/r11 in
@@ -71,6 +76,39 @@
         }],
       ],
     },
+    # For the same lame reasons as what is done for skia_opts, we have to
+    # create another target specifically for SSSE3 code as we would not want
+    # to compile the SSE2 code with -mssse3 which would potentially allow
+    # gcc to generate SSSE3 code.
+    {
+      'target_name': 'opts_ssse3',
+      'type': 'static_library',
+      'include_dirs': [
+        '../include/config',
+        '../include/core',
+        '../src/core',
+      ],
+      'conditions': [
+        [ 'skia_os in ["linux", "freebsd", "openbsd", "solaris"]', {
+          'cflags': [
+            '-mssse3',
+          ],
+        }],
+        # TODO(epoger): the following will enable SSSE3 on Macs, but it will
+        # break once we set OTHER_CFLAGS anywhere else (the first setting will
+        # be replaced, not added to)
+        [ 'skia_os in ["mac"]', {
+          'xcode_settings': {
+            'OTHER_CFLAGS': ['-mssse3',],
+          },
+        }],
+        [ 'skia_target_arch != "arm"', {
+          'sources': [
+            '../src/opts/SkBitmapProcState_opts_SSSE3.cpp',
+          ],
+        }],
+      ],
+    },
   ],
 }
 
diff --git a/gyp/ports.gyp b/gyp/ports.gyp
index c89a06d..5dc88d5 100644
--- a/gyp/ports.gyp
+++ b/gyp/ports.gyp
@@ -48,11 +48,15 @@
           ],
           'sources': [
             '../src/ports/SkFontHost_mac_coretext.cpp',
+            '../src/utils/mac/SkStream_mac.cpp',
 #            '../src/ports/SkFontHost_FreeType.cpp',
 #            '../src/ports/SkFontHost_freetype_mac.cpp',
 #            '../src/ports/SkFontHost_gamma_none.cpp',
             '../src/ports/SkThread_pthread.cpp',
           ],
+          'sources!': [
+            '../src/ports/SkFontHost_tables.cpp',
+          ],
         }],
         [ 'skia_os == "ios"', {
           'include_dirs': [
diff --git a/gyp/sfnt.gyp b/gyp/sfnt.gyp
new file mode 100644
index 0000000..2f2fc76
--- /dev/null
+++ b/gyp/sfnt.gyp
@@ -0,0 +1,50 @@
+{
+  'includes': [
+    'common.gypi',
+  ],
+  'targets': [
+    {
+      'target_name': 'sfnt',
+      'type': 'static_library',
+      'dependencies': [
+        'core.gyp:core',
+      ],
+      'include_dirs': [
+        '../src/sfnt',
+      ],
+      'sources': [
+        '../src/sfnt/SkIBMFamilyClass.h',
+        '../src/sfnt/SkOTTableTypes.h',
+        '../src/sfnt/SkOTTable_head.h',
+        '../src/sfnt/SkOTTable_hhea.h',
+        '../src/sfnt/SkOTTable_name.h',
+        '../src/sfnt/SkOTTable_OS_2.h',
+        '../src/sfnt/SkOTTable_OS_2_V0.h',
+        '../src/sfnt/SkOTTable_OS_2_V1.h',
+        '../src/sfnt/SkOTTable_OS_2_V2.h',
+        '../src/sfnt/SkOTTable_OS_2_V3.h',
+        '../src/sfnt/SkOTTable_OS_2_V4.h',
+        '../src/sfnt/SkOTTable_OS_2_VA.h',
+        '../src/sfnt/SkOTTable_post.h',
+        '../src/sfnt/SkPanose.h',
+        '../src/sfnt/SkOTUtils.h',
+        '../src/sfnt/SkPreprocessorSeq.h',
+        '../src/sfnt/SkSFNTHeader.h',
+        '../src/sfnt/SkTypedEnum.h',
+
+        '../src/sfnt/SkOTUtils.cpp',
+      ],
+      'direct_dependent_settings': {
+        'include_dirs': [
+          '../src/sfnt',
+        ],
+      },
+    },
+  ],
+}
+
+# Local Variables:
+# tab-width:2
+# indent-tabs-mode:nil
+# End:
+# vim: set expandtab tabstop=2 shiftwidth=2:
diff --git a/gyp/tests.gyp b/gyp/tests.gyp
index 27a34a0..cf793ab 100644
--- a/gyp/tests.gyp
+++ b/gyp/tests.gyp
@@ -27,11 +27,14 @@
         '../tests/ColorFilterTest.cpp',
         '../tests/ColorTest.cpp',
         '../tests/DataRefTest.cpp',
+        '../tests/DeferredCanvasTest.cpp',
         '../tests/DequeTest.cpp',
         '../tests/DrawBitmapRectTest.cpp',
+        '../tests/DrawTextTest.cpp',
         '../tests/EmptyPathTest.cpp',
         '../tests/FillPathTest.cpp',
         '../tests/FlateTest.cpp',
+        '../tests/FontHostTest.cpp',
         '../tests/GeometryTest.cpp',
         '../tests/GLInterfaceValidation.cpp',
         '../tests/GLProgramsTest.cpp',
@@ -49,6 +52,7 @@
         '../tests/PathTest.cpp',
         '../tests/PDFPrimitivesTest.cpp',
         '../tests/PointTest.cpp',
+        '../tests/PremulAlphaRoundTripTest.cpp',
         '../tests/QuickRejectTest.cpp',
         '../tests/Reader32Test.cpp',
         '../tests/ReadPixelsTest.cpp',
diff --git a/gyp/utils.gyp b/gyp/utils.gyp
index b4b2be3..8d2415a 100644
--- a/gyp/utils.gyp
+++ b/gyp/utils.gyp
@@ -22,6 +22,7 @@
         '../include/utils/SkCamera.h',
         '../include/utils/SkCubicInterval.h',
         '../include/utils/SkCullPoints.h',
+        '../include/utils/SkDeferredCanvas.h',
         '../include/utils/SkDumpCanvas.h',
         '../include/utils/SkInterpolator.h',
         '../include/utils/SkLayer.h',
@@ -38,11 +39,14 @@
         '../include/utils/SkUnitMappers.h',
         '../include/utils/SkWGL.h',
 
+        '../src/utils/SkBase64.cpp',
+        '../src/utils/SkBase64.h',
         '../src/utils/SkBoundaryPatch.cpp',
         '../src/utils/SkCamera.cpp',
         '../src/utils/SkColorMatrix.cpp',
         '../src/utils/SkCubicInterval.cpp',
         '../src/utils/SkCullPoints.cpp',
+        '../src/utils/SkDeferredCanvas.cpp',
         '../src/utils/SkDumpCanvas.cpp',
         '../src/utils/SkInterpolator.cpp',
         '../src/utils/SkLayer.cpp',
diff --git a/include/core/SkCanvas.h b/include/core/SkCanvas.h
index 24e4141..1b9f055 100644
--- a/include/core/SkCanvas.h
+++ b/include/core/SkCanvas.h
@@ -62,6 +62,11 @@
     ///////////////////////////////////////////////////////////////////////////
 
     /**
+     *  Trigger the immediate execution of all pending draw operations.
+     */
+    void flush();
+
+    /**
      *  Return the width/height of the underlying device. The current drawable
      *  area may be small (due to clipping or saveLayer). For a canvas with
      *  no device, 0,0 will be returned.
@@ -78,7 +83,7 @@
         reference count is incremented. If the canvas was already holding a
         device, its reference count is decremented. The new device is returned.
     */
-    SkDevice* setDevice(SkDevice* device);
+    virtual SkDevice* setDevice(SkDevice* device);
 
     /**
      *  saveLayer() can create another device (which is later drawn onto
@@ -99,7 +104,7 @@
      *  Shortcut for getDevice()->createCompatibleDevice(...).
      *  If getDevice() == NULL, this method does nothing, and returns NULL.
      */
-    SkDevice* createCompatibleDevice(SkBitmap::Config config, 
+    SkDevice* createCompatibleDevice(SkBitmap::Config config,
                                     int width, int height,
                                     bool isOpaque);
 
@@ -152,7 +157,7 @@
      *  kARGB_8888_Config as SkPMColor
      *
      *  If the bitmap has pixels already allocated, the canvas pixels will be
-     *  written there. If not, bitmap->allocPixels() will be called 
+     *  written there. If not, bitmap->allocPixels() will be called
      *  automatically. If the bitmap is backed by a texture readPixels will
      *  fail.
      *
@@ -290,7 +295,7 @@
     /** Returns true if drawing is currently going to a layer (from saveLayer)
      *  rather than to the root device.
      */
-    bool isDrawingToLayer() const;
+    virtual bool isDrawingToLayer() const;
 
     /** Preconcat the current matrix with the specified translation
         @param dx   The distance to translate in X
@@ -424,7 +429,16 @@
         @return true if the horizontal band is completely clipped out (i.e. does
                      not intersect the current clip)
     */
-    bool quickRejectY(SkScalar top, SkScalar bottom, EdgeType et) const;
+    bool quickRejectY(SkScalar top, SkScalar bottom, EdgeType et) const {
+        SkASSERT(SkScalarToCompareType(top) <= SkScalarToCompareType(bottom));
+        const SkRectCompareType& clipR = this->getLocalClipBoundsCompareType(et);
+        // In the case where the clip is empty and we are provided with a
+        // negative top and positive bottom parameter then this test will return
+        // false even though it will be clipped. We have chosen to exclude that
+        // check as it is rare and would result in double the comparisons.
+        return SkScalarToCompareType(top) >= clipR.fBottom
+            || SkScalarToCompareType(bottom) <= clipR.fTop;
+    }
 
     /** Return the bounds of the current clip (in local coordinates) in the
         bounds parameter, and return true if it is non-empty. This can be useful
@@ -438,7 +452,7 @@
         then taking its bounds.
     */
     bool getClipDeviceBounds(SkIRect* bounds) const;
-       
+
 
     /** Fill the entire canvas' bitmap (restricted to the current clip) with the
         specified ARGB color, using the specified mode.
@@ -921,10 +935,21 @@
     };
 
 protected:
+    // Returns the canvas to be used by DrawIter. Default implementation
+    // returns this. Subclasses that encapsulate an indirect canvas may
+    // need to override this method. The impl must keep track of the returned
+    // canvas, as it is not released or deleted by the caller.
+    virtual SkCanvas* canvasForDrawIter();
+
     // all of the drawBitmap variants call this guy
     virtual void commonDrawBitmap(const SkBitmap&, const SkIRect*,
                                   const SkMatrix&, const SkPaint& paint);
 
+    // Clip rectangle bounds. Called internally by saveLayer.
+    // Returns false if the rectangle is entirely clipped out.
+    bool clipRectBounds(const SkRect* bounds, SaveFlags flags,
+                         SkIRect* intersection);
+
 private:
     class MCRec;
 
@@ -947,7 +972,7 @@
 
     friend class SkDrawIter;    // needs setupDrawForLayerDevice()
 
-    SkDevice* createLayerDevice(SkBitmap::Config, int width, int height, 
+    SkDevice* createLayerDevice(SkBitmap::Config, int width, int height,
                                 bool isOpaque);
 
     SkDevice* init(SkDevice*);
@@ -962,7 +987,7 @@
                                 const SkRect& dst, const SkPaint* paint);
     void internalDrawPaint(const SkPaint& paint);
 
-        
+
     void drawDevice(SkDevice*, int x, int y, const SkPaint*);
     // shared by save() and saveLayer()
     int internalSave(SaveFlags flags);
diff --git a/include/core/SkColorPriv.h b/include/core/SkColorPriv.h
index 714e845..e51b0b9 100644
--- a/include/core/SkColorPriv.h
+++ b/include/core/SkColorPriv.h
@@ -216,6 +216,57 @@
 }
 
 /**
+ * Abstract 4-byte interpolation, implemented on top of SkPMColor
+ * utility functions. Third parameter controls blending of the first two:
+ *   (src, dst, 0) returns dst
+ *   (src, dst, 0xFF) returns src
+ */
+static inline SkPMColor SkFourByteInterp(SkPMColor src, SkPMColor dst,
+                                         U8CPU srcWeight) {
+    unsigned scale = SkAlpha255To256(srcWeight);
+
+    unsigned a = SkAlphaBlend(SkGetPackedA32(src), SkGetPackedA32(dst), scale);
+    unsigned r = SkAlphaBlend(SkGetPackedR32(src), SkGetPackedR32(dst), scale);
+    unsigned g = SkAlphaBlend(SkGetPackedG32(src), SkGetPackedG32(dst), scale);
+    unsigned b = SkAlphaBlend(SkGetPackedB32(src), SkGetPackedB32(dst), scale);
+
+    return SkPackARGB32(a, r, g, b);
+}
+
+/**
+ * 32b optimized version; currently appears to be 10% faster even on 64b
+ * architectures than an equivalent 64b version and 30% faster than
+ * SkFourByteInterp(). Third parameter controls blending of the first two:
+ *   (src, dst, 0) returns dst
+ *   (src, dst, 0xFF) returns src
+ * ** Does not match the results of SkFourByteInterp() because we use
+ * a more accurate scale computation!
+ * TODO: migrate Skia function to using an accurate 255->256 alpha
+ * conversion.
+ */
+static inline SkPMColor SkFastFourByteInterp(SkPMColor src,
+                                             SkPMColor dst,
+                                             U8CPU srcWeight) {
+    SkASSERT(srcWeight < 256);
+
+    // Reorders ARGB to AG-RB in order to reduce the number of operations.
+    const uint32_t mask = 0xFF00FF;
+    uint32_t src_rb = src & mask;
+    uint32_t src_ag = (src >> 8) & mask;
+    uint32_t dst_rb = dst & mask;
+    uint32_t dst_ag = (dst >> 8) & mask;
+
+    // scale = srcWeight + (srcWeight >> 7) is more accurate than
+    // scale = srcWeight + 1, but 7% slower
+    int scale = srcWeight + (srcWeight >> 7);
+
+    uint32_t ret_rb = src_rb * scale + (256 - scale) * dst_rb;
+    uint32_t ret_ag = src_ag * scale + (256 - scale) * dst_ag;
+
+    return (ret_ag & ~mask) | ((ret_rb & ~mask) >> 8);
+}
+
+/**
  *  Same as SkPackARGB32, but this version guarantees to not check that the
  *  values are premultiplied in the debug version.
  */
@@ -663,5 +714,116 @@
 // used for cheap 2x2 dithering when the colors are opaque
 void sk_dither_memset16(uint16_t dst[], uint16_t value, uint16_t other, int n);
 
+///////////////////////////////////////////////////////////////////////////////
+
+static inline int SkUpscale31To32(int value) {
+    SkASSERT((unsigned)value <= 31);
+    return value + (value >> 4);
+}
+
+static inline int SkBlend32(int src, int dst, int scale) {
+    SkASSERT((unsigned)src <= 0xFF);
+    SkASSERT((unsigned)dst <= 0xFF);
+    SkASSERT((unsigned)scale <= 32);
+    return dst + ((src - dst) * scale >> 5);
+}
+
+static inline SkPMColor SkBlendLCD16(int srcA, int srcR, int srcG, int srcB,
+                                     SkPMColor dst, uint16_t mask) { 
+    if (mask == 0) {
+        return dst;
+    }
+        
+    /*  We want all of these in 5bits, hence the shifts in case one of them
+     *  (green) is 6bits.
+     */
+    int maskR = SkGetPackedR16(mask) >> (SK_R16_BITS - 5);
+    int maskG = SkGetPackedG16(mask) >> (SK_G16_BITS - 5);
+    int maskB = SkGetPackedB16(mask) >> (SK_B16_BITS - 5);
+        
+    // Now upscale them to 0..32, so we can use blend32
+    maskR = SkUpscale31To32(maskR);
+    maskG = SkUpscale31To32(maskG);
+    maskB = SkUpscale31To32(maskB);
+     
+    // srcA has been upscaled to 256 before being passed into this function
+    maskR = maskR * srcA >> 8;
+    maskG = maskG * srcA >> 8;
+    maskB = maskB * srcA >> 8;
+        
+    int dstR = SkGetPackedR32(dst);
+    int dstG = SkGetPackedG32(dst);
+    int dstB = SkGetPackedB32(dst);
+        
+    // LCD blitting is only supported if the dst is known/required
+    // to be opaque
+    return SkPackARGB32(0xFF,
+                        SkBlend32(srcR, dstR, maskR),
+                        SkBlend32(srcG, dstG, maskG),
+                        SkBlend32(srcB, dstB, maskB));
+}
+
+static inline SkPMColor SkBlendLCD16Opaque(int srcR, int srcG, int srcB,
+                                           SkPMColor dst, uint16_t mask,
+                                           SkPMColor opaqueDst) { 
+    if (mask == 0) {
+        return dst;
+    }
+
+    if (0xFFFF == mask) {
+        return opaqueDst;
+    }
+        
+    /*  We want all of these in 5bits, hence the shifts in case one of them
+     *  (green) is 6bits.
+     */
+    int maskR = SkGetPackedR16(mask) >> (SK_R16_BITS - 5);
+    int maskG = SkGetPackedG16(mask) >> (SK_G16_BITS - 5);
+    int maskB = SkGetPackedB16(mask) >> (SK_B16_BITS - 5);
+        
+    // Now upscale them to 0..32, so we can use blend32
+    maskR = SkUpscale31To32(maskR);
+    maskG = SkUpscale31To32(maskG);
+    maskB = SkUpscale31To32(maskB);
+        
+    int dstR = SkGetPackedR32(dst);
+    int dstG = SkGetPackedG32(dst);
+    int dstB = SkGetPackedB32(dst);
+        
+    // LCD blitting is only supported if the dst is known/required
+    // to be opaque
+    return SkPackARGB32(0xFF,
+                        SkBlend32(srcR, dstR, maskR),
+                        SkBlend32(srcG, dstG, maskG),
+                        SkBlend32(srcB, dstB, maskB));
+}
+
+static inline void SkBlitLCD16Row(SkPMColor dst[], const uint16_t src[],
+                                  SkColor color, int width, SkPMColor) {
+    int srcA = SkColorGetA(color);
+    int srcR = SkColorGetR(color);
+    int srcG = SkColorGetG(color);
+    int srcB = SkColorGetB(color);
+    
+    srcA = SkAlpha255To256(srcA);
+    
+    for (int i = 0; i < width; i++) {
+        dst[i] = SkBlendLCD16(srcA, srcR, srcG, srcB, dst[i], src[i]);
+    }
+}
+
+static inline void SkBlitLCD16OpaqueRow(SkPMColor dst[], const uint16_t src[],
+                                        SkColor color, int width, 
+                                        SkPMColor opaqueDst) {
+    int srcR = SkColorGetR(color);
+    int srcG = SkColorGetG(color);
+    int srcB = SkColorGetB(color);
+    
+    for (int i = 0; i < width; i++) {
+        dst[i] = SkBlendLCD16Opaque(srcR, srcG, srcB, dst[i], src[i],
+                                    opaqueDst); 
+    }
+}
+
 #endif
 
diff --git a/include/core/SkDevice.h b/include/core/SkDevice.h
index c026a4b..3303981 100644
--- a/include/core/SkDevice.h
+++ b/include/core/SkDevice.h
@@ -62,7 +62,7 @@
      *                  draw into this device such that all of the pixels will
      *                  be opaque.
      */
-    SkDevice* createCompatibleDevice(SkBitmap::Config config, 
+    SkDevice* createCompatibleDevice(SkBitmap::Config config,
                                      int width, int height,
                                      bool isOpaque);
 
@@ -258,7 +258,7 @@
      *  kARGB_8888_Config as SkPMColor
      *
      *  If the bitmap has pixels already allocated, the device pixels will be
-     *  written there. If not, bitmap->allocPixels() will be called 
+     *  written there. If not, bitmap->allocPixels() will be called
      *  automatically. If the bitmap is backed by a texture readPixels will
      *  fail.
      *
@@ -279,11 +279,14 @@
 
     ///////////////////////////////////////////////////////////////////////////
 
-    /** Update as needed the pixel value in the bitmap, so that the caller can access
-        the pixels directly. Note: only the pixels field should be altered. The config/width/height/rowbytes
-        must remain unchanged.
+    /** Update as needed the pixel value in the bitmap, so that the caller can
+        access the pixels directly. Note: only the pixels field should be
+        altered. The config/width/height/rowbytes must remain unchanged.
+        @param bitmap The device's bitmap
+        @return Echo the bitmap parameter, or an alternate (shadow) bitmap 
+            maintained by the subclass.
     */
-    virtual void onAccessBitmap(SkBitmap*);
+    virtual const SkBitmap& onAccessBitmap(SkBitmap*);
 
     SkPixelRef* getPixelRef() const { return fBitmap.pixelRef(); }
     // just for subclasses, to assign a custom pixelref
@@ -291,7 +294,7 @@
         fBitmap.setPixelRef(pr, offset);
         return pr;
     }
-    
+
     /**
      * Implements readPixels API. The caller will ensure that:
      *  1. bitmap has pixel config kARGB_8888_Config.
@@ -327,7 +330,7 @@
                              const SkMatrix& ctm,
                              SkBitmap* result, SkIPoint* offset);
 
-    // This is equal kBGRA_Premul_Config8888 or kRGBA_Premul_Config8888 if 
+    // This is equal kBGRA_Premul_Config8888 or kRGBA_Premul_Config8888 if
     // either is identical to kNative_Premul_Config8888. Otherwise, -1.
     static const SkCanvas::Config8888 kPMColorAlias;
 
@@ -342,15 +345,15 @@
     // just called by SkCanvas when built as a layer
     void setOrigin(int x, int y) { fOrigin.set(x, y); }
     // just called by SkCanvas for saveLayer
-    SkDevice* createCompatibleDeviceForSaveLayer(SkBitmap::Config config, 
+    SkDevice* createCompatibleDeviceForSaveLayer(SkBitmap::Config config,
                                                  int width, int height,
                                                  bool isOpaque);
 
     /**
      * Subclasses should override this to implement createCompatibleDevice.
      */
-    virtual SkDevice* onCreateCompatibleDevice(SkBitmap::Config config, 
-                                               int width, int height, 
+    virtual SkDevice* onCreateCompatibleDevice(SkBitmap::Config config,
+                                               int width, int height,
                                                bool isOpaque,
                                                Usage usage);
 
diff --git a/include/core/SkDeviceProfile.h b/include/core/SkDeviceProfile.h
new file mode 100644
index 0000000..46b9781
--- /dev/null
+++ b/include/core/SkDeviceProfile.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkDeviceProfile_DEFINED
+#define SkDeviceProfile_DEFINED
+
+#include "SkRefCnt.h"
+
+class SkDeviceProfile : public SkRefCnt {
+public:
+    enum LCDConfig {
+        kNone_LCDConfig,   // disables LCD text rendering, uses A8 instead
+        kRGB_Horizontal_LCDConfig,
+        kBGR_Horizontal_LCDConfig,
+        kRGB_Vertical_LCDConfig,
+        kBGR_Vertical_LCDConfig,
+    };
+
+    enum FontHintLevel {
+        kNone_FontHintLevel,
+        kSlight_FontHintLevel,
+        kNormal_FontHintLevel,
+        kFull_FontHintLevel,
+        kAuto_FontHintLevel,
+    };
+
+    /**
+     *  gammaExp is typically between 1.0 and 2.2. For no gamma adjustment,
+     *  specify 1.0
+     *
+     *  contrastScale will be pinned between 0.0 and 1.0. For no contrast
+     *  adjustment, specify 0.0
+     *
+     *  @param config   Describes the LCD layout for this device. If this is set
+     *                  to kNone_LCDConfig, then all requests for LCD text will
+     *                  be devolved to A8 antialiasing.
+     *
+     *  @param level    The hinting level to be used, IF the paint specifies
+     *                  "default". Otherwise the paint's hinting level will be
+     *                  respected.
+     */
+    static SkDeviceProfile* Create(float gammaExp,
+                                   float contrastScale,
+                                   LCDConfig,
+                                   FontHintLevel);
+
+    /**
+     *  Returns the global default profile, which is used if no global profile
+     *  is specified with SetGlobal(), or if NULL is specified to SetGlobal().
+     *  The reference count is *not* incremented, and the caller should not
+     *  call unref().
+     */
+    static SkDeviceProfile* GetDefault();
+
+    /**
+     *  Return the current global profile (or the default if no global had yet
+     *  been set) and increment its reference count. The caller *must* call
+     *  unref() when it is done using it.
+     */
+    static SkDeviceProfile* RefGlobal();
+
+    /**
+     *  Make the specified profile be the global value for all subsequently
+     *  instantiated devices. Does not affect any existing devices.
+     *  Increments the reference count on the profile.
+     *  Specify NULL for the "identity" profile (where there is no gamma or
+     *  contrast correction).
+     */
+    static void SetGlobal(SkDeviceProfile*);
+
+    float getFontGammaExponent() const { return fGammaExponent; }
+    float getFontContrastScale() const { return fContrastScale; }
+
+    /**
+     *  Given a luminance byte (0 for black, 0xFF for white), generate a table
+     *  that applies the gamma/contrast settings to linear coverage values.
+     */
+    void generateTableForLuminanceByte(U8CPU lumByte, uint8_t table[256]) const;
+
+private:
+    SkDeviceProfile(float gammaExp, float contrastScale, LCDConfig,
+                    FontHintLevel);
+
+    float           fGammaExponent;
+    float           fContrastScale;
+    LCDConfig       fLCDConfig;
+    FontHintLevel   fFontHintLevel;
+};
+
+#endif
+
diff --git a/include/core/SkEndian.h b/include/core/SkEndian.h
index 3eb67da..910cf1e 100644
--- a/include/core/SkEndian.h
+++ b/include/core/SkEndian.h
@@ -31,8 +31,11 @@
 */
 static inline uint16_t SkEndianSwap16(U16CPU value) {
     SkASSERT(value == (uint16_t)value);
-    return (uint16_t)((value >> 8) | (value << 8));
+    return static_cast<uint16_t>((value >> 8) | (value << 8));
 }
+template<uint16_t N> struct SkTEndianSwap16 {
+    static const uint16_t value = static_cast<uint16_t>((N >> 8) | ((N & 0xFF) << 8));
+};
 
 /** Vector version of SkEndianSwap16(), which swaps the
     low two bytes of each value in the array.
@@ -55,6 +58,12 @@
             ((value & 0xFF0000) >> 8) |
             (value >> 24);
 }
+template<uint32_t N> struct SkTEndianSwap32 {
+    static const uint32_t value = ((N & 0xFF) << 24) |
+                                  ((N & 0xFF00) << 8) |
+                                  ((N & 0xFF0000) >> 8) |
+                                  (N >> 24);
+};
 
 /** Vector version of SkEndianSwap16(), which swaps the
     bytes of each value in the array.
@@ -73,11 +82,21 @@
     #define SkEndian_SwapBE32(n)    SkEndianSwap32(n)
     #define SkEndian_SwapLE16(n)    (n)
     #define SkEndian_SwapLE32(n)    (n)
+
+    #define SkTEndian_SwapBE16(n)    SkTEndianSwap16<n>::value
+    #define SkTEndian_SwapBE32(n)    SkTEndianSwap32<n>::value
+    #define SkTEndian_SwapLE16(n)    (n)
+    #define SkTEndian_SwapLE32(n)    (n)
 #else   // SK_CPU_BENDIAN
     #define SkEndian_SwapBE16(n)    (n)
     #define SkEndian_SwapBE32(n)    (n)
     #define SkEndian_SwapLE16(n)    SkEndianSwap16(n)
     #define SkEndian_SwapLE32(n)    SkEndianSwap32(n)
+
+    #define SkTEndian_SwapBE16(n)    (n)
+    #define SkTEndian_SwapBE32(n)    (n)
+    #define SkTEndian_SwapLE16(n)    SkTEndianSwap16<n>::value
+    #define SkTEndian_SwapLE32(n)    SkTEndianSwap32<n>::value
 #endif
 
 // When a bytestream is embedded in a 32-bit word, how far we need to
@@ -94,5 +113,40 @@
     #define SkEndian_Byte3Shift 0
 #endif
 
+
+#if defined(SK_UINT8_BITFIELD_LENDIAN) && defined(SK_UINT8_BITFIELD_BENDIAN)
+    #error "can't have both bitfield LENDIAN and BENDIAN defined"
+#endif
+
+#if !defined(SK_UINT8_BITFIELD_LENDIAN) && !defined(SK_UINT8_BITFIELD_BENDIAN)
+    #ifdef SK_CPU_LENDIAN
+        #define SK_UINT8_BITFIELD_LENDIAN
+    #else
+        #define SK_UINT8_BITFIELD_BENDIAN
+    #endif
+#endif
+
+#ifdef SK_UINT8_BITFIELD_LENDIAN
+    #define SK_UINT8_BITFIELD(f0, f1, f2, f3, f4, f5, f6, f7) \
+        SK_OT_BYTE f0 : 1; \
+        SK_OT_BYTE f1 : 1; \
+        SK_OT_BYTE f2 : 1; \
+        SK_OT_BYTE f3 : 1; \
+        SK_OT_BYTE f4 : 1; \
+        SK_OT_BYTE f5 : 1; \
+        SK_OT_BYTE f6 : 1; \
+        SK_OT_BYTE f7 : 1;
+#else
+    #define SK_UINT8_BITFIELD(f0, f1, f2, f3, f4, f5, f6, f7) \
+        SK_OT_BYTE f7 : 1; \
+        SK_OT_BYTE f6 : 1; \
+        SK_OT_BYTE f5 : 1; \
+        SK_OT_BYTE f4 : 1; \
+        SK_OT_BYTE f3 : 1; \
+        SK_OT_BYTE f2 : 1; \
+        SK_OT_BYTE f1 : 1; \
+        SK_OT_BYTE f0 : 1;
+#endif
+
 #endif
 
diff --git a/include/core/SkFlattenable.h b/include/core/SkFlattenable.h
index a66638e..34ca34e 100644
--- a/include/core/SkFlattenable.h
+++ b/include/core/SkFlattenable.h
@@ -161,6 +161,9 @@
     void* readFunctionPtr();
     SkFlattenable* readFlattenable();
     
+    void setPictureVersion(uint32_t version) { fPictureVersion = version; }
+    uint32_t getPictureVersion() { return fPictureVersion; }
+
 private:
     SkRefCnt** fRCArray;
     int        fRCCount;
@@ -172,6 +175,8 @@
     SkFlattenable::Factory* fFactoryArray;
     int                     fFactoryCount;
     
+    uint32_t fPictureVersion;
+
     typedef SkReader32 INHERITED;
 };
 
diff --git a/include/core/SkFontHost.h b/include/core/SkFontHost.h
index c549519..25c9ecb 100644
--- a/include/core/SkFontHost.h
+++ b/include/core/SkFontHost.h
@@ -84,12 +84,6 @@
 
     ///////////////////////////////////////////////////////////////////////////
 
-    /** Returns true if the specified unique ID matches an existing font.
-        Returning false is similar to calling OpenStream with an invalid ID,
-        which will return NULL in that case.
-    */
-    static bool ValidFontID(SkFontID uniqueID);
-
     /** Return a new stream to read the font data, or null if the uniqueID does
         not match an existing typeface. .The caller must call stream->unref()
         when it is finished reading the data.
diff --git a/include/core/SkImageFilter.h b/include/core/SkImageFilter.h
index 22b9569..7d7c140 100644
--- a/include/core/SkImageFilter.h
+++ b/include/core/SkImageFilter.h
@@ -80,6 +80,22 @@
      */
     virtual bool asABlur(SkSize* sigma) const;
 
+    /**
+     *  Experimental.
+     *
+     *  If the filter can be expressed as an erode, return true and
+     *  set the radius in X and Y.
+     */
+    virtual bool asAnErode(SkISize* radius) const;
+
+    /**
+     *  Experimental.
+     *
+     *  If the filter can be expressed as a dilation, return true and
+     *  set the radius in X and Y.
+     */
+    virtual bool asADilate(SkISize* radius) const;
+
 protected:
     SkImageFilter() {}
     explicit SkImageFilter(SkFlattenableReadBuffer& rb) : INHERITED(rb) {}
diff --git a/include/core/SkPaint.h b/include/core/SkPaint.h
index 445f7eb..31bc30b 100644
--- a/include/core/SkPaint.h
+++ b/include/core/SkPaint.h
@@ -100,11 +100,12 @@
         kEmbeddedBitmapText_Flag = 0x400, //!< mask to enable embedded bitmap strikes
         kAutoHinting_Flag     = 0x800,  //!< mask to force Freetype's autohinter
         kVerticalText_Flag    = 0x1000,
+        kGenA8FromLCD_Flag    = 0x2000, // hack for GDI -- do not use if you can help it
 
         // when adding extra flags, note that the fFlags member is specified
         // with a bit-width and you'll have to expand it.
 
-        kAllFlags = 0x1FFF
+        kAllFlags = 0x3FFF
     };
 
     /** Return the paint's flags. Use the Flag enum to test flag values.
@@ -880,7 +881,7 @@
     SkColor         fColor;
     SkScalar        fWidth;
     SkScalar        fMiterLimit;
-    unsigned        fFlags : 14;
+    unsigned        fFlags : 15;
     unsigned        fTextAlign : 2;
     unsigned        fCapType : 2;
     unsigned        fJoinType : 2;
diff --git a/include/core/SkPath.h b/include/core/SkPath.h
index 859486c..957d50e 100644
--- a/include/core/SkPath.h
+++ b/include/core/SkPath.h
@@ -732,6 +732,8 @@
 
 #ifdef SK_BUILD_FOR_ANDROID
     uint32_t getGenerationID() const;
+    const SkPath* getSourcePath() const;
+    void setSourcePath(const SkPath* path);
 #endif
 
     SkDEBUGCODE(void validate() const;)
@@ -740,19 +742,20 @@
     SkTDArray<SkPoint>  fPts;
     SkTDArray<uint8_t>  fVerbs;
     mutable SkRect      fBounds;
+    int                 fLastMoveToIndex;
     uint8_t             fFillType;
     uint8_t             fSegmentMask;
     mutable uint8_t     fBoundsIsDirty;
     mutable uint8_t     fConvexity;
 #ifdef SK_BUILD_FOR_ANDROID
     uint32_t            fGenerationID;
+    const SkPath*       fSourcePath;
 #endif
 
     // called, if dirty, by getBounds()
     void computeBounds() const;
 
     friend class Iter;
-    void cons_moveto();
 
     friend class SkPathStroker;
     /*  Append the first contour of path, ignoring path's initial point. If no
@@ -767,7 +770,14 @@
     */
     void reversePathTo(const SkPath&);
 
-    friend const SkPoint* sk_get_path_points(const SkPath&, int index);
+    // called before we add points for lineTo, quadTo, cubicTo, checking to see
+    // if we need to inject a leading moveTo first
+    //
+    //  SkPath path; path.lineTo(...);   <--- need a leading moveTo(0, 0)
+    // SkPath path; ... path.close(); path.lineTo(...) <-- need a moveTo(previous moveTo)
+    //
+    inline void injectMoveToIfNeeded();
+
     friend class SkAutoPathBoundsUpdate;
 };
 
diff --git a/include/core/SkPathMeasure.h b/include/core/SkPathMeasure.h
index ec43834..6fb4482 100644
--- a/include/core/SkPathMeasure.h
+++ b/include/core/SkPathMeasure.h
@@ -85,13 +85,14 @@
 
     struct Segment {
         SkScalar    fDistance;  // total distance up to this point
-        unsigned    fPtIndex : 15;
+        unsigned    fPtIndex : 15; // index into the fPts array
         unsigned    fTValue : 15;
         unsigned    fType : 2;
 
         SkScalar getScalarT() const;
     };
     SkTDArray<Segment>  fSegments;
+    SkTDArray<SkPoint>  fPts; // Points used to define the segments
 
     static const Segment* NextSegment(const Segment*);
 
@@ -104,4 +105,3 @@
 };
 
 #endif
-
diff --git a/include/core/SkPixelRef.h b/include/core/SkPixelRef.h
index e247479..d5f6ab2 100644
--- a/include/core/SkPixelRef.h
+++ b/include/core/SkPixelRef.h
@@ -50,9 +50,9 @@
 
     This class can be shared/accessed between multiple threads.
 */
-class SkPixelRef : public SkRefCnt {
+class SK_API SkPixelRef : public SkRefCnt {
 public:
-    explicit SkPixelRef(SkMutex* mutex = NULL);
+    explicit SkPixelRef(SkBaseMutex* mutex = NULL);
 
     /** Return the pixel memory returned from lockPixels, or null if the
         lockCount is 0.
@@ -201,16 +201,16 @@
     /** Return the mutex associated with this pixelref. This value is assigned
         in the constructor, and cannot change during the lifetime of the object.
     */
-    SkMutex* mutex() const { return fMutex; }
+    SkBaseMutex* mutex() const { return fMutex; }
 
-    SkPixelRef(SkFlattenableReadBuffer&, SkMutex*);
+    SkPixelRef(SkFlattenableReadBuffer&, SkBaseMutex*);
 
 private:
 #if !SK_ALLOW_STATIC_GLOBAL_INITIALIZERS
     static void InitializeFlattenables();
 #endif
 
-    SkMutex*        fMutex; // must remain in scope for the life of this object
+    SkBaseMutex*    fMutex; // must remain in scope for the life of this object
     void*           fPixels;
     SkColorTable*   fColorTable;    // we do not track ownership, subclass does
     int             fLockCount;
diff --git a/include/core/SkPoint.h b/include/core/SkPoint.h
index de7c0ef..d371e64 100644
--- a/include/core/SkPoint.h
+++ b/include/core/SkPoint.h
@@ -442,11 +442,11 @@
     void setOrthog(const SkPoint& vec, Side side = kLeft_Side) {
         // vec could be this
         SkScalar tmp = vec.fX;
-        if (kLeft_Side == side) {
+        if (kRight_Side == side) {
             fX = -vec.fY;
             fY = tmp;
         } else {
-            SkASSERT(kRight_Side == side);
+            SkASSERT(kLeft_Side == side);
             fX = vec.fY;
             fY = -tmp;
         }
diff --git a/include/core/SkRegion.h b/include/core/SkRegion.h
index 9d89d94..7623b82 100644
--- a/include/core/SkRegion.h
+++ b/include/core/SkRegion.h
@@ -45,13 +45,13 @@
      *  Return true if the two regions are equal. i.e. The enclose exactly
      *  the same area.
      */
-    friend bool operator==(const SkRegion& a, const SkRegion& b);
+    bool operator==(const SkRegion& other) const;
 
     /**
      *  Return true if the two regions are not equal.
      */
-    friend bool operator!=(const SkRegion& a, const SkRegion& b) {
-        return !(a == b);
+    bool operator!=(const SkRegion& other) const {
+        return !(*this == other);
     }
     
     /**
diff --git a/include/core/SkScalerContext.h b/include/core/SkScalerContext.h
index e7dd7d4..29679d6 100644
--- a/include/core/SkScalerContext.h
+++ b/include/core/SkScalerContext.h
@@ -16,6 +16,8 @@
 #include "SkPath.h"
 #include "SkPoint.h"
 
+//#define SK_USE_COLOR_LUMINANCE
+
 class SkDescriptor;
 class SkMaskFilter;
 class SkPathEffect;
@@ -175,16 +177,27 @@
         kLCD_Vertical_Flag        = 0x0200,    // else Horizontal
         kLCD_BGROrder_Flag        = 0x0400,    // else RGB order
 
+        // Generate A8 from LCD source (for GDI), only meaningful if fMaskFormat is kA8
+        // Perhaps we can store this (instead) in fMaskFormat, in the high bit?
+        kGenA8FromLCD_Flag        = 0x0800,
+
+#ifdef SK_USE_COLOR_LUMINANCE
+        kLuminance_Bits           = 3,
+#else
         // luminance : 0 for black text, kLuminance_Max for white text
-        kLuminance_Shift          = 11, // to shift into the other flags above
+        kLuminance_Shift          = 13, // shift to land in the high 3-bits of Flags
         kLuminance_Bits           = 3,  // ensure Flags doesn't exceed 16bits
+#endif
     };
     
     // computed values
     enum {
         kHinting_Mask   = kHintingBit1_Flag | kHintingBit2_Flag,
+#ifdef SK_USE_COLOR_LUMINANCE
+#else
         kLuminance_Max  = (1 << kLuminance_Bits) - 1,
         kLuminance_Mask = kLuminance_Max << kLuminance_Shift,
+#endif
     };
 
     struct Rec {
@@ -193,6 +206,9 @@
         SkScalar    fTextSize, fPreScaleX, fPreSkewX;
         SkScalar    fPost2x2[2][2];
         SkScalar    fFrameWidth, fMiterLimit;
+#ifdef SK_USE_COLOR_LUMINANCE
+        uint32_t    fLumBits;
+#endif
         uint8_t     fMaskFormat;
         uint8_t     fStrokeJoin;
         uint16_t    fFlags;
@@ -213,7 +229,20 @@
         void setHinting(SkPaint::Hinting hinting) {
             fFlags = (fFlags & ~kHinting_Mask) | (hinting << kHinting_Shift);
         }
-
+        
+        SkMask::Format getFormat() const {
+            return static_cast<SkMask::Format>(fMaskFormat);
+        }
+        
+#ifdef SK_USE_COLOR_LUMINANCE
+        SkColor getLuminanceColor() const {
+            return fLumBits;
+        }
+        
+        void setLuminanceColor(SkColor c) {
+            fLumBits = c;
+        }
+#else
         unsigned getLuminanceBits() const {
             return (fFlags & kLuminance_Mask) >> kLuminance_Shift;
         }
@@ -230,10 +259,7 @@
             lum |= (lum << kLuminance_Bits*2);
             return lum >> (4*kLuminance_Bits - 8);
         }
-
-        SkMask::Format getFormat() const {
-            return static_cast<SkMask::Format>(fMaskFormat);
-        }
+#endif
     };
 
     SkScalerContext(const SkDescriptor* desc);
@@ -277,6 +303,8 @@
 #endif
 
     static inline void MakeRec(const SkPaint&, const SkMatrix*, Rec* rec);
+    static inline void PostMakeRec(Rec*);
+
     static SkScalerContext* Create(const SkDescriptor*);
 
 protected:
diff --git a/include/core/SkStream.h b/include/core/SkStream.h
index 90d2357..67512d7 100644
--- a/include/core/SkStream.h
+++ b/include/core/SkStream.h
@@ -272,7 +272,8 @@
 public:
     SkMemoryWStream(void* buffer, size_t size);
     virtual bool write(const void* buffer, size_t size) SK_OVERRIDE;
-    
+    size_t bytesWritten() const { return fBytesWritten; }
+
 private:
     char*   fBuffer;
     size_t  fMaxLength;
diff --git a/include/core/SkStroke.h b/include/core/SkStroke.h
index d055b83..e5d69c4 100644
--- a/include/core/SkStroke.h
+++ b/include/core/SkStroke.h
@@ -16,10 +16,6 @@
 struct SkRect;
 class SkPath;
 
-#define SK_DefaultStrokeWidth       SK_Scalar1
-#define SK_DefaultMiterLimit        SkIntToScalar(4)
-
-
 /** \class SkStroke
     SkStroke is the utility class that constructs paths by stroking
     geometries (lines, rects, ovals, roundrects, paths). This is
diff --git a/include/core/SkThread.h b/include/core/SkThread.h
index 5f2da4a..1495a16 100644
--- a/include/core/SkThread.h
+++ b/include/core/SkThread.h
@@ -31,7 +31,7 @@
 
 class SkAutoMutexAcquire : SkNoncopyable {
 public:
-    explicit SkAutoMutexAcquire(SkMutex& mutex) : fMutex(&mutex)
+    explicit SkAutoMutexAcquire(SkBaseMutex& mutex) : fMutex(&mutex)
     {
         SkASSERT(fMutex != NULL);
         mutex.acquire();
@@ -55,7 +55,7 @@
     }
         
 private:
-    SkMutex* fMutex;
+    SkBaseMutex* fMutex;
 };
 
 #endif
diff --git a/include/core/SkThread_platform.h b/include/core/SkThread_platform.h
index d83f3ed..863f6e3 100644
--- a/include/core/SkThread_platform.h
+++ b/include/core/SkThread_platform.h
@@ -53,38 +53,44 @@
 
 #endif // !SK_BUILD_FOR_ANDROID
 
-#if defined(SK_BUILD_FOR_ANDROID) && !defined(SK_BUILD_FOR_ANDROID_NDK)
+#ifdef SK_USE_POSIX_THREADS
 
-#include <utils/threads.h>
+#include <pthread.h>
 
-class SkMutex : android::Mutex {
-public:
-    // if isGlobal is true, then ignore any errors in the platform-specific
-    // destructor
-    SkMutex(bool isGlobal = true) {}
-    ~SkMutex() {}
-
-    void    acquire() { this->lock(); }
-    void    release() { this->unlock(); }
+// A SkBaseMutex is a POD structure that can be directly initialized
+// at declaration time with SK_DECLARE_STATIC/GLOBAL_MUTEX. This avoids the
+// generation of a static initializer in the final machine code (and
+// a corresponding static finalizer).
+//
+struct SkBaseMutex {
+    void    acquire() { pthread_mutex_lock(&fMutex); }
+    void    release() { pthread_mutex_unlock(&fMutex); }
+    pthread_mutex_t  fMutex;
 };
 
-#else
+// Using POD-style initialization prevents the generation of a static initializer
+// and keeps the acquire() implementation small and fast.
+#define SK_DECLARE_STATIC_MUTEX(name)   static SkBaseMutex  name = { PTHREAD_MUTEX_INITIALIZER }
 
-/** Implemented by the porting layer, this function adds 1 to the int specified
-    by the address (in a thread-safe manner), and returns the previous value.
-*/
-SK_API int32_t sk_atomic_inc(int32_t* addr);
-/** Implemented by the porting layer, this function subtracts 1 to the int
-    specified by the address (in a thread-safe manner), and returns the previous
-    value.
-*/
-SK_API int32_t sk_atomic_dec(int32_t* addr);
+// Special case used when the static mutex must be available globally.
+#define SK_DECLARE_GLOBAL_MUTEX(name)   SkBaseMutex  name = { PTHREAD_MUTEX_INITIALIZER }
 
-class SkMutex {
+// A normal mutex that must be initialized through normal C++ construction,
+// i.e. when it's a member of another class, or allocated on the heap.
+class SkMutex : public SkBaseMutex, SkNoncopyable {
 public:
-    // if isGlobal is true, then ignore any errors in the platform-specific
-    // destructor
-    SkMutex(bool isGlobal = true);
+    SkMutex();
+    ~SkMutex();
+};
+
+#else // !SK_USE_POSIX_THREADS
+
+// In the generic case, SkBaseMutex and SkMutex are the same thing, and we
+// can't easily get rid of static initializers.
+//
+class SkMutex : SkNoncopyable {
+public:
+    SkMutex();
     ~SkMutex();
 
     void    acquire();
@@ -98,6 +104,12 @@
     uint32_t    fStorage[kStorageIntCount];
 };
 
-#endif
+typedef SkMutex SkBaseMutex;
+
+#define SK_DECLARE_STATIC_MUTEX(name)  static SkBaseMutex  name
+#define SK_DECLARE_GLOBAL_MUTEX(name)  SkBaseMutex  name
+
+#endif // !SK_USE_POSIX_THREADS
+
 
 #endif
diff --git a/include/core/SkUserConfig.h b/include/core/SkUserConfig.h
index b409e82..ef75114 100644
--- a/include/core/SkUserConfig.h
+++ b/include/core/SkUserConfig.h
@@ -41,11 +41,16 @@
 // ANDROID Specific changes - NO NOT CHECK BACK INTO code.google.com/p/skia
 //
 
+#define PICTURE_VERSION_ICS 1 // r1562 of Skia
+#define PICTURE_VERSION_JB  2
+
 // do this build check for other tools that still read this header
 #ifdef ANDROID
     #include <utils/misc.h>
 #endif
 
+#define SK_USE_POSIX_THREADS
+
 /*  Scalars (the fractional value type in skia) can be implemented either as
     floats or 16.16 integers (fixed). Exactly one of these two symbols must be
     defined.
@@ -111,7 +116,7 @@
     printf conventions (e.g. const char* format, ...). If you want to redirect
     this to something other than printf, define yours here
  */
-//#define SkDebugf(...) MyFunction(__VA_ARGS__)
+//#define SkDebugf(...)  MyFunction(__VA_ARGS__)
 
 /*
  *  To specify a different default font cache limit, define this. If this is
diff --git a/include/effects/SkMorphologyImageFilter.h b/include/effects/SkMorphologyImageFilter.h
new file mode 100644
index 0000000..2297938
--- /dev/null
+++ b/include/effects/SkMorphologyImageFilter.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2012 The Android Open Source Project
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+
+#ifndef SkMorphologyImageFilter_DEFINED
+#define SkMorphologyImageFilter_DEFINED
+
+#include "SkImageFilter.h"
+
+class SK_API SkMorphologyImageFilter : public SkImageFilter {
+public:
+    explicit SkMorphologyImageFilter(SkFlattenableReadBuffer& buffer);
+    SkMorphologyImageFilter(int radiusX, int radiusY);
+
+protected:
+    virtual void flatten(SkFlattenableWriteBuffer& buffer) SK_OVERRIDE;
+    SkISize    radius() const { return fRadius; }
+
+private:
+    SkISize    fRadius;
+    typedef SkImageFilter INHERITED;
+};
+
+class SK_API SkDilateImageFilter : public SkMorphologyImageFilter {
+public:
+    SkDilateImageFilter(int radiusX, int radiusY) : INHERITED(radiusX, radiusY) {}
+    explicit SkDilateImageFilter(SkFlattenableReadBuffer& buffer) : INHERITED(buffer) {}
+
+    virtual bool asADilate(SkISize* radius) const SK_OVERRIDE;
+    virtual bool onFilterImage(Proxy*, const SkBitmap& src, const SkMatrix&,
+                               SkBitmap* result, SkIPoint* offset) SK_OVERRIDE;
+    static SkFlattenable* CreateProc(SkFlattenableReadBuffer& buffer) {
+        return SkNEW_ARGS(SkDilateImageFilter, (buffer));
+    }
+    virtual Factory getFactory() SK_OVERRIDE { return CreateProc; }
+    SK_DECLARE_FLATTENABLE_REGISTRAR()
+
+    typedef SkMorphologyImageFilter INHERITED;
+};
+
+class SK_API SkErodeImageFilter : public SkMorphologyImageFilter {
+public:
+    SkErodeImageFilter(int radiusX, int radiusY) : INHERITED(radiusX, radiusY) {}
+    explicit SkErodeImageFilter(SkFlattenableReadBuffer& buffer) : INHERITED(buffer) {}
+
+    virtual bool asAnErode(SkISize* radius) const SK_OVERRIDE;
+    virtual bool onFilterImage(Proxy*, const SkBitmap& src, const SkMatrix&,
+                               SkBitmap* result, SkIPoint* offset) SK_OVERRIDE;
+
+    static SkFlattenable* CreateProc(SkFlattenableReadBuffer& buffer) {
+        return SkNEW_ARGS(SkErodeImageFilter, (buffer));
+    }
+    virtual Factory getFactory() SK_OVERRIDE { return CreateProc; }
+    SK_DECLARE_FLATTENABLE_REGISTRAR()
+
+private:
+    typedef SkMorphologyImageFilter INHERITED;
+};
+
+#endif
+
diff --git a/include/effects/SkTableColorFilter.h b/include/effects/SkTableColorFilter.h
index 0aefdbd..b442197 100644
--- a/include/effects/SkTableColorFilter.h
+++ b/include/effects/SkTableColorFilter.h
@@ -4,7 +4,7 @@
 
 #include "SkColorFilter.h"
 
-class SkTableColorFilter {
+class SK_API SkTableColorFilter {
 public:
     /**
      *  Create a table colorfilter, copying the table into the filter, and
diff --git a/include/effects/SkTestImageFilters.h b/include/effects/SkTestImageFilters.h
index db020ad..55522c1 100755
--- a/include/effects/SkTestImageFilters.h
+++ b/include/effects/SkTestImageFilters.h
@@ -3,6 +3,7 @@
 #define _SkTestImageFilters_h
 
 #include "SkImageFilter.h"
+#include "SkColorFilter.h"
 
 class SkOffsetImageFilter : public SkImageFilter {
 public:
@@ -100,8 +101,6 @@
     typedef SkImageFilter INHERITED;
 };
 
-class SkColorFilter;
-
 class SkColorFilterImageFilter : public SkImageFilter {
 public:
     SkColorFilterImageFilter(SkColorFilter* cf) : fColorFilter(cf) {
diff --git a/include/gpu/GrConfig.h b/include/gpu/GrConfig.h
index 72b9748..1dfe199 100644
--- a/include/gpu/GrConfig.h
+++ b/include/gpu/GrConfig.h
@@ -364,23 +364,6 @@
     #define GR_GEOM_BUFFER_LOCK_THRESHOLD (1 << 15)
 #endif
 
-/**
- * Enables/disables use of offscreen AA
- */
-#if !defined(GR_USE_OFFSCREEN_AA)
-    #define GR_USE_OFFSCREEN_AA 1
-#endif
-
-/**
- * GR_MAX_OFFSCREEN_AA_SIZE controls the size at which offscreen AA will tile.
- * Tiling saves GPU memory by limiting the size of the offscreen buffer. The
- * max offscreen may be as large as (4*GR_MAX_OFFSCREEN_AA_SIZE)^2 pixels.
- */
-#if !defined(GR_MAX_OFFSCREEN_AA_SIZE)
-    #define GR_MAX_OFFSCREEN_AA_SIZE    256
-#endif
-
-
 ///////////////////////////////////////////////////////////////////////////////
 // tail section:
 //
diff --git a/include/gpu/GrContext.h b/include/gpu/GrContext.h
index 0308b5d..37160b1 100644
--- a/include/gpu/GrContext.h
+++ b/include/gpu/GrContext.h
@@ -75,6 +75,11 @@
      */
     void freeGpuResources();
 
+    /**
+     * Returns the number of bytes of GPU memory hosted by the texture cache.
+     */
+    size_t getGpuTextureCacheBytes() const;
+
     ///////////////////////////////////////////////////////////////////////////
     // Textures
 
@@ -82,7 +87,7 @@
      * Token that refers to an entry in the texture cache. Returned by
      * functions that lock textures. Passed to unlockTexture.
      */
-    class TextureCacheEntry {
+    class SK_API TextureCacheEntry {
     public:
         TextureCacheEntry() : fEntry(NULL) {}
         TextureCacheEntry(const TextureCacheEntry& e) : fEntry(e.fEntry) {}
@@ -295,26 +300,6 @@
      GrRenderTarget* createPlatformRenderTarget(
                                     const GrPlatformRenderTargetDesc& desc);
 
-    /**
-     * This interface is depracted and will be removed in a future revision.
-     * Callers should use createPlatformTexture or createPlatformRenderTarget
-     * instead.
-     *
-     * Wraps an existing 3D API surface in a GrObject. desc.fFlags determines
-     * the type of object returned. If kIsTexture is set the returned object
-     * will be a GrTexture*. Otherwise, it will be a GrRenderTarget*. If both 
-     * are set the render target object is accessible by
-     * GrTexture::asRenderTarget().
-     *
-     * GL: if the object is a texture Gr may change its GL texture parameters
-     *     when it is drawn.
-     *
-     * @param   desc    description of the object to create.
-     * @return either a GrTexture* or GrRenderTarget* depending on desc. NULL
-     *         on failure.
-     */
-    GrResource* createPlatformSurface(const GrPlatformSurfaceDesc& desc);
-
     ///////////////////////////////////////////////////////////////////////////
     // Matrix state
 
@@ -577,31 +562,48 @@
      * @param dst           the render target to copy to.
      */
     void copyTexture(GrTexture* src, GrRenderTarget* dst);
+
     /**
-     * Applies a 1D convolution kernel in the X direction to a rectangle of
+     * Resolves a render target that has MSAA. The intermediate MSAA buffer is
+     * downsampled to the associated GrTexture (accessible via
+     * GrRenderTarget::asTexture()). Any pending draws to the render target will
+     * be executed before the resolve.
+     *
+     * This is only necessary when a client wants to access the object directly
+     * using the underlying graphics API. GrContext will detect when it must
+     * perform a resolve to a GrTexture used as the source of a draw or before
+     * reading pixels back from a GrTexture or GrRenderTarget.
+     */
+    void resolveRenderTarget(GrRenderTarget* target);
+
+    /**
+     * Applies a 1D convolution kernel in the given direction to a rectangle of
      * pixels from a given texture.
      * @param texture         the texture to read from
      * @param rect            the destination rectangle
      * @param kernel          the convolution kernel (kernelWidth elements)
      * @param kernelWidth     the width of the convolution kernel
+     * @param direction       the direction in which to apply the kernel
      */
-    void convolveInX(GrTexture* texture,
-                     const SkRect& rect,
-                     const float* kernel,
-                     int kernelWidth);
+    void convolve(GrTexture* texture,
+                  const SkRect& rect,
+                  const float* kernel,
+                  int kernelWidth,
+                  GrSamplerState::FilterDirection direction);
     /**
-     * Applies a 1D convolution kernel in the Y direction to a rectangle of
+     * Applies a 1D morphology in the given direction to a rectangle of
      * pixels from a given texture.
-     * direction.
      * @param texture         the texture to read from
      * @param rect            the destination rectangle
-     * @param kernel          the convolution kernel (kernelWidth elements)
-     * @param kernelWidth     the width of the convolution kernel
+     * @param radius          the radius of the morphological operator
+     * @param filter          the filter kernel (must be kDilate_Filter or kErode_Filter)
+     * @param direction       the direction in which to apply the morphology
      */
-    void convolveInY(GrTexture* texture,
-                     const SkRect& rect,
-                     const float* kernel,
-                     int kernelWidth);
+    void applyMorphology(GrTexture* texture,
+                         const SkRect& rect,
+                         int radius,
+                         GrSamplerState::Filter filter,
+                         GrSamplerState::FilterDirection direction);
     ///////////////////////////////////////////////////////////////////////////
     // Helpers
 
@@ -670,7 +672,6 @@
 
     GrIndexBuffer*              fAAFillRectIndexBuffer;
     GrIndexBuffer*              fAAStrokeRectIndexBuffer;
-    int                         fMaxOffscreenAASize;
 
     GrContext(GrGpu* gpu);
 
@@ -699,47 +700,9 @@
 
     GrPathRenderer* getPathRenderer(const GrPath& path,
                                     GrPathFill fill,
+                                    const GrDrawTarget* target,
                                     bool antiAlias);
 
-    struct OffscreenRecord;
-
-    // determines whether offscreen AA should be applied
-    bool doOffscreenAA(GrDrawTarget* target,
-                       bool isHairLines) const;
-
-    // attempts to setup offscreen AA. All paint state must be transferred to
-    // target by the time this is called.
-    bool prepareForOffscreenAA(GrDrawTarget* target,
-                               bool requireStencil,
-                               const GrIRect& boundRect,
-                               GrPathRenderer* pr,
-                               OffscreenRecord* record);
-
-    // sets up target to draw coverage to the supersampled render target
-    void setupOffscreenAAPass1(GrDrawTarget* target,
-                               const GrIRect& boundRect,
-                               int tileX, int tileY,
-                               OffscreenRecord* record);
-
-    // sets up target to sample coverage of supersampled render target back
-    // to the main render target using stage kOffscreenStage.
-    void doOffscreenAAPass2(GrDrawTarget* target,
-                            const GrPaint& paint,
-                            const GrIRect& boundRect,
-                            int tileX, int tileY,
-                            OffscreenRecord* record);
-
-    // restored the draw target state and releases offscreen target to cache
-    void cleanupOffscreenAA(GrDrawTarget* target,
-                            GrPathRenderer* pr,
-                            OffscreenRecord* record);
-
-    void convolve(GrTexture* texture,
-                  const SkRect& rect,
-                  float imageIncrement[2],
-                  const float* kernel,
-                  int kernelWidth);
-
     /**
      * Flags to the internal read/write pixels funcs
      */
@@ -878,4 +841,3 @@
 };
 
 #endif
-
diff --git a/include/gpu/GrGLConfig_chrome.h b/include/gpu/GrGLConfig_chrome.h
deleted file mode 100644
index ee3c991..0000000
--- a/include/gpu/GrGLConfig_chrome.h
+++ /dev/null
@@ -1,30 +0,0 @@
-
-/*
- * Copyright 2011 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-#ifndef GrGLConfig_chrome_DEFINED
-#define GrGLConfig_chrome_DEFINED
-
-// glGetError() forces a sync with gpu process on chrome
-#define GR_GL_CHECK_ERROR_START         0
-
-// ANGLE creates a temp VB for vertex attributes not specified per-vertex.
-#define GR_GL_NO_CONSTANT_ATTRIBUTES    GR_WIN32_BUILD
-
-// For RGBA teximage/readpixels ANGLE will sw-convert to/from BGRA.
-#define GR_GL_RGBA_8888_PIXEL_OPS_SLOW  GR_WIN32_BUILD
-
-// ANGLE can go faster if the entire fbo is read rather than a subrect
-#define GR_GL_FULL_READPIXELS_FASTER_THAN_PARTIAL GR_WIN32_BUILD
-
-// cmd buffer allocates memory and memsets it to zero when it sees glBufferData
-// with NULL.
-#define GR_GL_USE_BUFFER_DATA_NULL_HINT 0
-
-// chrome uses this to set the context on each GL call.
-#define GR_GL_PER_GL_FUNC_CALLBACK      1
-
-#endif
diff --git a/include/gpu/GrPaint.h b/include/gpu/GrPaint.h
index f1d74b2..9f220e0 100644
--- a/include/gpu/GrPaint.h
+++ b/include/gpu/GrPaint.h
@@ -36,6 +36,7 @@
     bool                        fColorMatrixEnabled;
 
     GrColor                     fColor;
+    uint8_t                     fCoverage;
 
     GrColor                     fColorFilterColor;
     SkXfermode::Mode            fColorFilterXfermode;
@@ -126,6 +127,7 @@
         fDither = paint.fDither;
 
         fColor = paint.fColor;
+        fCoverage = paint.fCoverage;
 
         fColorFilterColor = paint.fColorFilterColor;
         fColorFilterXfermode = paint.fColorFilterXfermode;
@@ -161,6 +163,7 @@
         this->resetBlend();
         this->resetOptions();
         this->resetColor();
+        this->resetCoverage();
         this->resetTextures();
         this->resetColorFilter();
         this->resetMasks();
@@ -242,6 +245,10 @@
         fColor = GrColorPackRGBA(0xff, 0xff, 0xff, 0xff);
     }
 
+    void resetCoverage() {
+        fCoverage = 0xff;
+    }
+
     void resetTextures() {
         for (int i = 0; i < kMaxTextures; ++i) {
             this->setTexture(i, NULL);
diff --git a/include/gpu/GrRenderTarget.h b/include/gpu/GrRenderTarget.h
index 13b2160..909adb3 100644
--- a/include/gpu/GrRenderTarget.h
+++ b/include/gpu/GrRenderTarget.h
@@ -112,6 +112,14 @@
      */
     const GrIRect& getResolveRect() const { return fResolveRect; }
 
+    /**
+     * If the render target is multisampled this will perform a multisample
+     * resolve. Any pending draws to the target are first flushed. This only
+     * applies to render targets that are associated with GrTextures. After the
+     * function returns the GrTexture will contain the resolved pixels.
+     */
+    void resolve();
+
     // GrResource overrides
     virtual size_t sizeInBytes() const;
 
diff --git a/include/gpu/GrSamplerState.h b/include/gpu/GrSamplerState.h
index 81dfdb3..50a6cc9 100644
--- a/include/gpu/GrSamplerState.h
+++ b/include/gpu/GrSamplerState.h
@@ -39,6 +39,14 @@
          * Apply a separable convolution kernel.
          */
         kConvolution_Filter,
+        /**
+         * Apply a dilate filter (max over a 1D radius).
+         */
+        kDilate_Filter,
+        /**
+         * Apply an erode filter (min over a 1D radius).
+         */
+        kErode_Filter,
 
         kDefault_Filter = kNearest_Filter
     };
@@ -87,6 +95,17 @@
     };
 
     /**
+     * For the filters which perform more than one texture sample (convolution,
+     * erode, dilate), this determines the direction in which the texture
+     * coordinates will be incremented.
+     */
+    enum FilterDirection {
+        kX_FilterDirection,
+        kY_FilterDirection,
+
+        kDefault_FilterDirection = kX_FilterDirection,
+    };
+    /**
      * Default sampler state is set to clamp, use normal sampling mode, be
      * unfiltered, and use identity matrix.
      */
@@ -99,6 +118,7 @@
 
     WrapMode getWrapX() const { return fWrapX; }
     WrapMode getWrapY() const { return fWrapY; }
+    FilterDirection getFilterDirection() const { return fFilterDirection; }
     SampleMode getSampleMode() const { return fSampleMode; }
     const GrMatrix& getMatrix() const { return fMatrix; }
     const GrRect& getTextureDomain() const { return fTextureDomain; }
@@ -106,7 +126,6 @@
     Filter getFilter() const { return fFilter; }
     int getKernelWidth() const { return fKernelWidth; }
     const float* getKernel() const { return fKernel; }
-    const float* getImageIncrement() const { return fImageIncrement; }
     bool swapsRAndB() const { return fSwapRAndB; }
 
     bool isGradient() const {
@@ -118,6 +137,7 @@
     void setWrapX(WrapMode mode) { fWrapX = mode; }
     void setWrapY(WrapMode mode) { fWrapY = mode; }
     void setSampleMode(SampleMode mode) { fSampleMode = mode; }
+    void setFilterDirection(FilterDirection mode) { fFilterDirection = mode; }
     
     /**
      * Access the sampler's matrix. See SampleMode for explanation of
@@ -158,24 +178,29 @@
 
     void reset(WrapMode wrapXAndY,
                Filter filter,
+               FilterDirection direction,
                const GrMatrix& matrix) {
         fWrapX = wrapXAndY;
         fWrapY = wrapXAndY;
         fSampleMode = kDefault_SampleMode;
         fFilter = filter;
+        fFilterDirection = direction;
         fMatrix = matrix;
         fTextureDomain.setEmpty();
         fSwapRAndB = false;
     }
+    void reset(WrapMode wrapXAndY, Filter filter, const GrMatrix& matrix) {
+        this->reset(wrapXAndY, filter, kDefault_FilterDirection, matrix);
+    }
     void reset(WrapMode wrapXAndY,
                Filter filter) {
-        this->reset(wrapXAndY, filter, GrMatrix::I());
+        this->reset(wrapXAndY, filter, kDefault_FilterDirection, GrMatrix::I());
     }
     void reset(const GrMatrix& matrix) {
-        this->reset(kDefault_WrapMode, kDefault_Filter, matrix);
+        this->reset(kDefault_WrapMode, kDefault_Filter, kDefault_FilterDirection, matrix);
     }
     void reset() {
-        this->reset(kDefault_WrapMode, kDefault_Filter, GrMatrix::I());
+        this->reset(kDefault_WrapMode, kDefault_Filter, kDefault_FilterDirection, GrMatrix::I());
     }
 
     GrScalar getRadial2CenterX1() const { return fRadial2CenterX1; }
@@ -198,37 +223,37 @@
         fRadial2PosRoot = posRoot;
     }
 
-    void setConvolutionParams(int kernelWidth, const float* kernel, float imageIncrement[2]) {
+    void setConvolutionParams(int kernelWidth, const float* kernel) {
         GrAssert(kernelWidth >= 0 && kernelWidth <= MAX_KERNEL_WIDTH);
         fKernelWidth = kernelWidth;
         if (NULL != kernel) {
             memcpy(fKernel, kernel, kernelWidth * sizeof(float));
         }
-        if (NULL != imageIncrement) {
-            memcpy(fImageIncrement, imageIncrement, sizeof(fImageIncrement));
-        } else {
-            memset(fImageIncrement, 0, sizeof(fImageIncrement));
-        }
+    }
+
+    void setMorphologyRadius(int radius) {
+        GrAssert(radius >= 0 && radius <= MAX_KERNEL_WIDTH);
+        fKernelWidth = radius;
     }
 
 private:
-    WrapMode    fWrapX : 8;
-    WrapMode    fWrapY : 8;
-    SampleMode  fSampleMode : 8;
-    Filter      fFilter : 8;
-    GrMatrix    fMatrix;
-    bool        fSwapRAndB;
-    GrRect      fTextureDomain;
+    WrapMode            fWrapX : 8;
+    WrapMode            fWrapY : 8;
+    FilterDirection     fFilterDirection : 8;
+    SampleMode          fSampleMode : 8;
+    Filter              fFilter : 8;
+    GrMatrix            fMatrix;
+    bool                fSwapRAndB;
+    GrRect              fTextureDomain;
 
     // these are undefined unless fSampleMode == kRadial2_SampleMode
-    GrScalar    fRadial2CenterX1;
-    GrScalar    fRadial2Radius0;
-    SkBool8     fRadial2PosRoot;
+    GrScalar            fRadial2CenterX1;
+    GrScalar            fRadial2Radius0;
+    SkBool8             fRadial2PosRoot;
 
     // These are undefined unless fFilter == kConvolution_Filter
-    uint8_t     fKernelWidth;
-    float       fImageIncrement[2];
-    float       fKernel[MAX_KERNEL_WIDTH];
+    uint8_t             fKernelWidth;
+    float               fKernel[MAX_KERNEL_WIDTH];
 };
 
 #endif
diff --git a/include/gpu/GrTypes.h b/include/gpu/GrTypes.h
index 0bcab7d..f6809d6 100644
--- a/include/gpu/GrTypes.h
+++ b/include/gpu/GrTypes.h
@@ -301,6 +301,8 @@
      * Unpremultiplied. Byte order is b,g,r,a
      */
     kBGRA_8888_UPM_GrPixelConfig,
+
+    kGrPixelConfigCount
 };
 
 // Aliases for pixel configs that match skia's byte order
@@ -429,18 +431,10 @@
 }
 
 /**
-    * Used to control the level of antialiasing available for a rendertarget.
-    * Anti-alias quality levels depend on the underlying API/GPU capabilities.
-    */
-enum GrAALevels {
-    kNone_GrAALevel, //<! No antialiasing available.
-    kLow_GrAALevel,  //<! Low quality antialiased rendering. Actual
-                     //   interpretation is platform-dependent.
-    kMed_GrAALevel,  //<! Medium quality antialiased rendering. Actual
-                     //   interpretation is platform-dependent.
-    kHigh_GrAALevel, //<! High quality antialiased rendering. Actual
-                     //   interpretation is platform-dependent.
-};
+ * DEPRECATED: This will be removed as soon as WebKit no longer references
+ * this (former) enum value.
+ */
+static const int kNone_GrAALevel = 0;
 
 /**
  * Optional bitfield flags that can be passed to createTexture.
@@ -479,18 +473,31 @@
  */
 struct GrTextureDesc {
     GrTextureFlags         fFlags;  //!< bitfield of TextureFlags
-    /**
-     * The level of antialiasing available for a rendertarget texture. Only used
-     * fFlags contains kRenderTarget_GrTextureFlag.
-     */
-    GrAALevels             fAALevel;
     int                    fWidth;  //!< Width of the texture
     int                    fHeight; //!< Height of the texture
+
     /**
      * Format of source data of the texture. Not guaraunteed to be the same as
      * internal format used by 3D API.
      */
     GrPixelConfig          fConfig;
+    
+    /**
+     * The number of samples per pixel or 0 to disable full scene AA. This only
+     * applies if the kRenderTarget_GrTextureFlagBit is set. The actual number
+     * of samples may not exactly match the request. The request will be rounded
+     * up to the next supported sample count, or down if it is larger than the
+     * max supported count.
+     */
+    union {
+        /**
+         * This field has two names for legacy reasons. Use the fSampleCnt name.
+         * fAALevel is deprecated and will be removed as soon as WebKit no
+         * longer uses it.
+         */
+        int fSampleCnt;
+        int fAALevel;
+    };
 };
 
 /**
@@ -593,24 +600,6 @@
     return gIsFillInverted[fill];
 }
 
-/**
- * Hints provided about a path's convexity (or lack thereof).
- */
-enum GrConvexHint {
-    kNone_ConvexHint,                         //<! No hint about convexity
-                                              //   of the path
-    kConvex_ConvexHint,                       //<! Path is one convex piece
-    kNonOverlappingConvexPieces_ConvexHint,   //<! Multiple convex pieces,
-                                              //   pieces are known to be
-                                              //   disjoint
-    kSameWindingConvexPieces_ConvexHint,      //<! Multiple convex pieces,
-                                              //   may or may not intersect,
-                                              //   either all wind cw or all
-                                              //   wind ccw.
-    kConcave_ConvexHint                       //<! Path is known to be
-                                              //   concave
-};
-
 ///////////////////////////////////////////////////////////////////////////////
 
 // opaque type for 3D API object handles
@@ -702,133 +691,6 @@
     GrPlatform3DObject              fRenderTargetHandle;
 };
 
-///////////////////////////////////////////////////////////////////////////////
-// DEPRECATED. createPlatformSurface is replaced by createPlatformTexture
-// and createPlatformRenderTarget. These enums and structs will be removed.
-
-enum GrPlatformSurfaceType {
-    /**
-     * Specifies that the object being created is a render target.
-     */
-    kRenderTarget_GrPlatformSurfaceType,
-    /**
-     * Specifies that the object being created is a texture.
-     */
-    kTexture_GrPlatformSurfaceType,
-    /**
-     * Specifies that the object being created is a texture and a render
-     * target.
-     */
-    kTextureRenderTarget_GrPlatformSurfaceType,
-};
-
-enum GrPlatformRenderTargetFlags {
-    kNone_GrPlatformRenderTargetFlagBit             = 0x0,
-
-    /**
-     * Gives permission to Gr to perform the downsample-resolve of a
-     * multisampled render target. If this is not set then read pixel
-     * operations may fail. If the object is both a texture and render target
-     * then this *must* be set. Otherwise, if the client wants do its own
-     * resolves it must create separate GrRenderTarget and GrTexture objects
-     * and insert appropriate flushes and resolves betweeen data hazards.
-     * GrRenderTarget has a flagForResolve()
-     */
-    kGrCanResolve_GrPlatformRenderTargetFlagBit     = 0x2,
-};
-
-GR_MAKE_BITFIELD_OPS(GrPlatformRenderTargetFlags)
-
-struct GrPlatformSurfaceDesc {
-    GrPlatformSurfaceType           fSurfaceType;   // type of surface to create
-    /**
-     * Flags for kRenderTarget and kTextureRenderTarget surface types
-     */
-    GrPlatformRenderTargetFlags     fRenderTargetFlags;
-
-    int                             fWidth;         // width in pixels
-    int                             fHeight;        // height in pixels
-    GrPixelConfig                   fConfig;        // color format
-    /**
-     * Number of per sample stencil buffer. Only relevant if kIsRenderTarget is
-     * set in fFlags.
-     */
-    int                             fStencilBits;
-
-    /**
-     * Number of samples per-pixel. Only relevant if kIsRenderTarget is set in
-     * fFlags.
-     */
-    int                             fSampleCnt;
-
-    /**
-     * Texture object in 3D API. Only relevant if fSurfaceType is kTexture or
-     * kTextureRenderTarget.
-     * GL: this is a texture object (glGenTextures)
-     */
-    GrPlatform3DObject              fPlatformTexture;
-    /**
-     * Render target object in 3D API. Only relevant if fSurfaceType is
-     * kRenderTarget or kTextureRenderTarget
-     * GL: this is a FBO object (glGenFramebuffers)
-     */
-    GrPlatform3DObject              fPlatformRenderTarget;
-    /**
-     * 3D API object used as destination of resolve. Only relevant if
-     * fSurfaceType is kRenderTarget or kTextureRenderTarget and
-     * kGrCanResolve is set in fRenderTargetFlags.
-     * fFlags.
-     * GL: this is a FBO object (glGenFramebuffers)
-     */
-    GrPlatform3DObject              fPlatformResolveDestination;
-
-    void reset() { memset(this, 0, sizeof(GrPlatformSurfaceDesc)); }
-};
-
-/**
- * Example of how to wrap render-to-texture-with-MSAA GL objects with a GrPlatformSurace
- *
- * GLint colorBufferID;
- * glGenRenderbuffers(1, &colorID);
- * glBindRenderbuffer(GL_RENDERBUFFER, colorBufferID);
- * glRenderbufferStorageMultisample(GL_RENDERBUFFER, S, GL_RGBA, W, H);
- *
- * GLint stencilBufferID;
- * glGenRenderBuffers(1, &stencilBufferID);
- * glBindRenderbuffer(GL_RENDERBUFFER, stencilBufferID);
- * glRenderbufferStorageMultisample(GL_RENDERBUFFER, S, GL_STENCIL_INDEX8, W, H);
- *
- * GLint drawFBOID;
- * glGenFramebuffers(1, &drawFBOID);
- * glBindFramebuffer(GL_FRAMEBUFFER, drawFBOID);
- * glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, colorBufferID);
- * glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_RENDERBUFFER, stencilBufferID);
- *
- * GLint textureID;
- * glGenTextures(1, &textureID);
- * glBindTexture(GL_TEXTURE_2D, textureID);
- * glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, W, H, ...);
- *
- * GLint readFBOID;
- * glGenFramebuffers(1, &readFBOID);
- * glBindFramebuffer(GL_FRAMEBUFFER, readFBOID);
- * glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, textureID, 0);
- *
- * GrPlatformSurfaceDesc renderTargetTextureDesc;
- * renderTargetTextureDesc.fSurfaceType       = kTextureRenderTarget_GrPlatformSurfaceType;
- * renderTargetTextureDesc.fRenderTargetFlags = kGrCanResolve_GrPlatformRenderTargetFlagBit;
- * renderTargetTextureDesc.fWidth = W;
- * renderTargetTextureDesc.fHeight = H;
- * renderTargetTextureDesc.fConfig = kSkia8888_PM_GrPixelConfig
- * renderTargetTextureDesc.fStencilBits = 8;
- * renderTargetTextureDesc.fSampleCnt = S;
- * renderTargetTextureDesc.fPlatformTexture = textureID;
- * renderTargetTextureDesc.fPlatformRenderTarget = drawFBOID;
- * renderTargetTextureDesc.fPlatformResolveDestination = readFBOID;
- *
- * GrTexture* texture = static_cast<GrTexture*>(grContext->createPlatrformSurface(renderTargetTextureDesc));
- */
-
 
 ///////////////////////////////////////////////////////////////////////////////
 
diff --git a/include/gpu/SkGpuDevice.h b/include/gpu/SkGpuDevice.h
index 6c4285e..6663ed5 100644
--- a/include/gpu/SkGpuDevice.h
+++ b/include/gpu/SkGpuDevice.h
@@ -35,7 +35,7 @@
      *  internally.
      */
     SkGpuDevice(GrContext*, SkBitmap::Config,
-                int width, int height, 
+                int width, int height,
                 SkDevice::Usage usage = SkDevice::kGeneral_Usage);
 
     /**
@@ -102,7 +102,7 @@
                             const SkPaint&) SK_OVERRIDE;
     virtual bool filterTextFlags(const SkPaint&, TextFlags*) SK_OVERRIDE;
 
-    virtual void flush(); 
+    virtual void flush();
 
     /**
      * Make's this device's rendertarget current in the underlying 3D API.
@@ -113,7 +113,7 @@
     virtual bool filterImage(SkImageFilter*, const SkBitmap& src,
                              const SkMatrix& ctm,
                              SkBitmap* result, SkIPoint* offset) SK_OVERRIDE;
-    
+
 protected:
     typedef GrContext::TextureCacheEntry TexCache;
     enum TexType {
@@ -144,7 +144,7 @@
         TexCache        fTex;
     };
     friend class SkAutoTexCache;
-    
+
     // overrides from SkDevice
     virtual bool onReadPixels(const SkBitmap& bitmap,
                               int x, int y,
@@ -170,8 +170,8 @@
     // caller needs to null out GrPaint's texture if
     // non-textured drawing is desired.
     // Set constantColor to true if a constant color
-    // will be used.  This is an optimization, and can 
-    // always be set to false. constantColor should 
+    // will be used.  This is an optimization, and can
+    // always be set to false. constantColor should
     // never be true if justAlpha is true.
     bool skPaint2GrPaintNoShader(const SkPaint& skPaint,
                                  bool justAlpha,
@@ -191,8 +191,8 @@
                                bool constantColor);
 
     // override from SkDevice
-    virtual SkDevice* onCreateCompatibleDevice(SkBitmap::Config config, 
-                                               int width, int height, 
+    virtual SkDevice* onCreateCompatibleDevice(SkBitmap::Config config,
+                                               int width, int height,
                                                bool isOpaque,
                                                Usage usage);
 
diff --git a/include/gpu/SkGrTexturePixelRef.h b/include/gpu/SkGrTexturePixelRef.h
index 720f130..ab92eff 100644
--- a/include/gpu/SkGrTexturePixelRef.h
+++ b/include/gpu/SkGrTexturePixelRef.h
@@ -21,7 +21,7 @@
  *  Common baseclass that implements onLockPixels() by calling onReadPixels().
  *  Since it has a copy, it always returns false for onLockPixelsAreWritable().
  */
-class SkROLockPixelsPixelRef : public SkPixelRef {
+class SK_API SkROLockPixelsPixelRef : public SkPixelRef {
 public:
     SkROLockPixelsPixelRef();
     virtual ~SkROLockPixelsPixelRef();
@@ -40,7 +40,7 @@
 /**
  *  PixelRef that wraps a GrTexture
  */
-class SkGrTexturePixelRef : public SkROLockPixelsPixelRef {
+class SK_API SkGrTexturePixelRef : public SkROLockPixelsPixelRef {
 public:
             SkGrTexturePixelRef(GrTexture*);
     virtual ~SkGrTexturePixelRef();
@@ -63,7 +63,7 @@
 /**
  *  PixelRef that wraps a GrRenderTarget
  */
-class SkGrRenderTargetPixelRef : public SkROLockPixelsPixelRef {
+class SK_API SkGrRenderTargetPixelRef : public SkROLockPixelsPixelRef {
 public:
             SkGrRenderTargetPixelRef(GrRenderTarget* rt);
     virtual ~SkGrRenderTargetPixelRef();
diff --git a/include/gpu/GrGLConfig.h b/include/gpu/gl/GrGLConfig.h
similarity index 74%
rename from include/gpu/GrGLConfig.h
rename to include/gpu/gl/GrGLConfig.h
index c9aaec5..806f055 100644
--- a/include/gpu/GrGLConfig.h
+++ b/include/gpu/gl/GrGLConfig.h
@@ -86,48 +86,98 @@
  * glReadPixels to read the entire framebuffer is faster than calling it with
  * the same sized rectangle but with a framebuffer bound that is larger than
  * the rectangle read.
+ *
+ * GR_GL_CHECK_ALLOC_WITH_GET_ERROR: If set to 1 then glTexImage,
+ * glBufferData, glRenderbufferStorage, etc will be checked for errors. This
+ * amounts to ensuring the error is GL_NO_ERROR, calling the allocating
+ * function, and then checking that the error is still GL_NO_ERROR. When the
+ * value is 0 we will assume no error was generated without checking.
+ *
+ * GR_GL_CHECK_FBO_STATUS_ONCE_PER_FORMAT: We will normally check the FBO status
+ * every time we bind a texture or renderbuffer to an FBO. However, in some
+ * environments CheckFrameBufferStatus is very expensive. If this is set we will
+ * check the first time we use a color format or a combination of color /
+ * stencil formats as attachments. If the FBO is complete we will assume
+ * subsequent attachments with the same formats are complete as well.
  */
 
 #if !defined(GR_GL_LOG_CALLS)
-    #define GR_GL_LOG_CALLS                     GR_DEBUG
+    #define GR_GL_LOG_CALLS                             GR_DEBUG
 #endif
 
 #if !defined(GR_GL_LOG_CALLS_START)
-    #define GR_GL_LOG_CALLS_START               0
+    #define GR_GL_LOG_CALLS_START                       0
 #endif
 
 #if !defined(GR_GL_CHECK_ERROR)
-    #define GR_GL_CHECK_ERROR                   GR_DEBUG
+    #define GR_GL_CHECK_ERROR                           GR_DEBUG
 #endif
 
 #if !defined(GR_GL_CHECK_ERROR_START)
-    #define GR_GL_CHECK_ERROR_START             1
+    #define GR_GL_CHECK_ERROR_START                     1
 #endif
 
 #if !defined(GR_GL_NO_CONSTANT_ATTRIBUTES)
-    #define GR_GL_NO_CONSTANT_ATTRIBUTES        0
+    #define GR_GL_NO_CONSTANT_ATTRIBUTES                0
 #endif
 
 #if !defined(GR_GL_ATTRIBUTE_MATRICES)
-    #define GR_GL_ATTRIBUTE_MATRICES            0
+    #define GR_GL_ATTRIBUTE_MATRICES                    0
 #endif
 
 #if !defined(GR_GL_USE_BUFFER_DATA_NULL_HINT)
-    #define GR_GL_USE_BUFFER_DATA_NULL_HINT     1
+    #define GR_GL_USE_BUFFER_DATA_NULL_HINT             1
 #endif
 
 #if !defined(GR_GL_PER_GL_FUNC_CALLBACK)
-    #define GR_GL_PER_GL_FUNC_CALLBACK          0
+    #define GR_GL_PER_GL_FUNC_CALLBACK                  0
 #endif
 
 #if !defined(GR_GL_RGBA_8888_PIXEL_OPS_SLOW)
-    #define GR_GL_RGBA_8888_PIXEL_OPS_SLOW      0
+    #define GR_GL_RGBA_8888_PIXEL_OPS_SLOW              0
 #endif
 
 #if !defined(GR_GL_FULL_READPIXELS_FASTER_THAN_PARTIAL)
-    #define GR_GL_FULL_READPIXELS_FASTER_THAN_PARTIAL 0
+    #define GR_GL_FULL_READPIXELS_FASTER_THAN_PARTIAL   0
 #endif
 
+#if !defined(GR_GL_CHECK_ALLOC_WITH_GET_ERROR)
+    #define GR_GL_CHECK_ALLOC_WITH_GET_ERROR            1
+#endif
+
+#if !defined(GR_GL_CHECK_FBO_STATUS_ONCE_PER_FORMAT)
+    #define GR_GL_CHECK_FBO_STATUS_ONCE_PER_FORMAT      0
+#endif
+
+/**
+ * There is a strange bug that occurs on Macs with NVIDIA GPUs. We don't
+ * fully understand it. When (element) array buffers are continually
+ * respecified using glBufferData performance can fall off of a cliff. The
+ * driver winds up performing many DMA mapping / unmappings and chews up ~50% of
+ * the core. However, it has been observed that occasionally respecifying the
+ * buffer using glBufferData and then writing data using glBufferSubData
+ * prevents the bad behavior.
+ *
+ * There is a lot of uncertainty around this issue. In Chrome backgrounding
+ * the tab somehow initiates this behavior and we don't know what the connection
+ * is. Another observation is that Chrome's cmd buffer server will actually
+ * create a buffer full of zeros when it sees a NULL data param (for security
+ * reasons). If this is disabled and NULL is actually passed all the way to the
+ * driver then the workaround doesn't help.
+ *
+ * The issue is tracked at:
+ * http://code.google.com/p/chromium/issues/detail?id=114865
+ *
+ * When the workaround is enabled we will use the glBufferData / glBufferSubData
+ * trick every 128 array buffer uploads.
+ *
+ * Hopefully we will understand this better and have a cleaner fix or get a
+ * OS/driver level fix.
+ */
+#define GR_GL_MAC_BUFFER_OBJECT_PERFOMANCE_WORKAROUND   \
+    (GR_MAC_BUILD &&                                    \
+     !GR_GL_USE_BUFFER_DATA_NULL_HINT)
+
 #if(GR_GL_NO_CONSTANT_ATTRIBUTES) && (GR_GL_ATTRIBUTE_MATRICES)
     #error "Cannot combine GR_GL_NO_CONSTANT_ATTRIBUTES and GR_GL_ATTRIBUTE_MATRICES"
 #endif
diff --git a/include/gpu/gl/GrGLConfig_chrome.h b/include/gpu/gl/GrGLConfig_chrome.h
new file mode 100644
index 0000000..50ea34c
--- /dev/null
+++ b/include/gpu/gl/GrGLConfig_chrome.h
@@ -0,0 +1,37 @@
+
+/*
+ * Copyright 2011 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+#ifndef GrGLConfig_chrome_DEFINED
+#define GrGLConfig_chrome_DEFINED
+
+// glGetError() forces a sync with gpu process on chrome
+#define GR_GL_CHECK_ERROR_START                     0
+
+// ANGLE creates a temp VB for vertex attributes not specified per-vertex.
+#define GR_GL_NO_CONSTANT_ATTRIBUTES                GR_WIN32_BUILD
+
+// For RGBA teximage/readpixels ANGLE will sw-convert to/from BGRA.
+#define GR_GL_RGBA_8888_PIXEL_OPS_SLOW              GR_WIN32_BUILD
+
+// ANGLE can go faster if the entire fbo is read rather than a subrect
+#define GR_GL_FULL_READPIXELS_FASTER_THAN_PARTIAL   GR_WIN32_BUILD
+
+// cmd buffer allocates memory and memsets it to zero when it sees glBufferData
+// with NULL.
+#define GR_GL_USE_BUFFER_DATA_NULL_HINT             0
+
+// chrome uses this to set the context on each GL call.
+#define GR_GL_PER_GL_FUNC_CALLBACK                  1
+
+// Check error is even more expensive in chrome (cmd buffer flush). The
+// compositor also doesn't check its allocations.
+#define GR_GL_CHECK_ALLOC_WITH_GET_ERROR            0
+
+// CheckFramebufferStatus in chrome synchronizes the gpu and renderer processes.
+#define GR_GL_CHECK_FBO_STATUS_ONCE_PER_FORMAT      1
+
+#endif
diff --git a/include/gpu/GrGLDefines.h b/include/gpu/gl/GrGLDefines.h
similarity index 100%
rename from include/gpu/GrGLDefines.h
rename to include/gpu/gl/GrGLDefines.h
diff --git a/include/gpu/GrGLInterface.h b/include/gpu/gl/GrGLInterface.h
similarity index 96%
rename from include/gpu/GrGLInterface.h
rename to include/gpu/gl/GrGLInterface.h
index 716cff9..968afab 100644
--- a/include/gpu/GrGLInterface.h
+++ b/include/gpu/gl/GrGLInterface.h
@@ -21,6 +21,23 @@
 ////////////////////////////////////////////////////////////////////////////////
 
 /**
+ * Classifies GL contexts (currently as Desktop vs. ES2). This is a bitfield.
+ * A GrGLInterface (defined below) may support multiple bindings.
+ */
+enum GrGLBinding {
+    kNone_GrGLBinding = 0x0,
+
+    kDesktop_GrGLBinding = 0x01,
+    kES2_GrGLBinding = 0x02,
+
+    // for iteration of GrGLBindings
+    kFirstGrGLBinding = kDesktop_GrGLBinding,
+    kLastGrGLBinding = kES2_GrGLBinding
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+/**
  * Helpers for glGetString()
  */
 
@@ -34,11 +51,13 @@
 
 // these variants assume caller already has a string from glGetString()
 GrGLVersion GrGLGetVersionFromString(const char* versionString);
+GrGLBinding GrGLGetBindingInUseFromString(const char* versionString);
 GrGLSLVersion GrGLGetGLSLVersionFromString(const char* versionString);
 bool GrGLHasExtensionFromString(const char* ext, const char* extensionString);
 
 // these variants call glGetString()
 bool GrGLHasExtension(const GrGLInterface*, const char* ext);
+GrGLBinding GrGLGetBindingInUse(const GrGLInterface*);
 GrGLVersion GrGLGetVersion(const GrGLInterface*);
 GrGLSLVersion GrGLGetGLSLVersion(const GrGLInterface*);
 
@@ -109,11 +128,6 @@
 typedef long GrGLintptr;
 typedef long GrGLsizeiptr;
 
-enum GrGLBinding {
-    kDesktop_GrGLBinding = 0x01,
-    kES2_GrGLBinding = 0x02
-};
-
 extern "C" {
     typedef GrGLvoid (GR_GL_FUNCTION_TYPE *GrGLActiveTextureProc)(GrGLenum texture);
     typedef GrGLvoid (GR_GL_FUNCTION_TYPE *GrGLAttachShaderProc)(GrGLuint program, GrGLuint shader);
@@ -267,13 +281,10 @@
 
     GrGLInterface();
 
-    bool validate() const;
-    bool supportsDesktop() const {
-        return 0 != (kDesktop_GrGLBinding & fBindingsExported);
-    }
-    bool supportsES2() const {
-        return 0 !=  (kES2_GrGLBinding & fBindingsExported);
-    }
+    // Validates that the GrGLInterface supports a binding. This means that
+    // the GrGLInterface advertises the binding in fBindingsExported and all
+    // the necessary function pointers have been initialized.
+    bool validate(GrGLBinding binding) const;
 
     // Indicator variable specifying the type of GL implementation
     // exported:  GLES{1|2} or Desktop.
diff --git a/include/gpu/SkGLContext.h b/include/gpu/gl/SkGLContext.h
similarity index 92%
rename from include/gpu/SkGLContext.h
rename to include/gpu/gl/SkGLContext.h
index f92a770..542d1bb 100644
--- a/include/gpu/SkGLContext.h
+++ b/include/gpu/gl/SkGLContext.h
@@ -9,6 +9,7 @@
 #define SkGLContext_DEFINED
 
 #include "GrGLInterface.h"
+#include "SkString.h"
 
 /**
  * Create an offscreen opengl context with an RGBA8 / 8bit stencil FBO.
@@ -31,6 +32,8 @@
 
     virtual void makeCurrent() const = 0;
 
+    bool hasExtension(const char* extensionName) const;
+
 protected:
     /**
      * Subclass implements this to make a GL context. The returned GrGLInterface 
@@ -46,6 +49,7 @@
     virtual void destroyGLContext() = 0;
 
 private:
+    SkString fExtensionString;
     GrGLuint fFBO;
     const GrGLInterface* fGL;
 };
diff --git a/include/gpu/SkMesaGLContext.h b/include/gpu/gl/SkMesaGLContext.h
similarity index 100%
rename from include/gpu/SkMesaGLContext.h
rename to include/gpu/gl/SkMesaGLContext.h
diff --git a/include/gpu/SkNativeGLContext.h b/include/gpu/gl/SkNativeGLContext.h
similarity index 100%
rename from include/gpu/SkNativeGLContext.h
rename to include/gpu/gl/SkNativeGLContext.h
diff --git a/include/gpu/SkNullGLContext.h b/include/gpu/gl/SkNullGLContext.h
similarity index 100%
rename from include/gpu/SkNullGLContext.h
rename to include/gpu/gl/SkNullGLContext.h
diff --git a/include/pdf/SkPDFDevice.h b/include/pdf/SkPDFDevice.h
index 4551149..9d6b54c 100644
--- a/include/pdf/SkPDFDevice.h
+++ b/include/pdf/SkPDFDevice.h
@@ -156,7 +156,7 @@
     const SkPDFGlyphSetMap& getFontGlyphUsage() const {
         return *(fFontGlyphUsage.get());
     }
-    
+
 protected:
     virtual bool onReadPixels(const SkBitmap& bitmap, int x, int y,
                               SkCanvas::Config8888) SK_OVERRIDE;
diff --git a/include/pdf/SkPDFFont.h b/include/pdf/SkPDFFont.h
index b884017..2ebdec7 100644
--- a/include/pdf/SkPDFFont.h
+++ b/include/pdf/SkPDFFont.h
@@ -196,7 +196,7 @@
 
     // This should be made a hash table if performance is a problem.
     static SkTDArray<FontRec>& CanonicalFonts();
-    static SkMutex& CanonicalFontsMutex();
+    static SkBaseMutex& CanonicalFontsMutex();
 };
 
 #endif
diff --git a/include/pdf/SkPDFGraphicState.h b/include/pdf/SkPDFGraphicState.h
index 9420405..af01737 100644
--- a/include/pdf/SkPDFGraphicState.h
+++ b/include/pdf/SkPDFGraphicState.h
@@ -86,7 +86,7 @@
 
     // This should be made a hash table if performance is a problem.
     static SkTDArray<GSCanonicalEntry>& CanonicalPaints();
-    static SkMutex& CanonicalPaintsMutex();
+    static SkBaseMutex& CanonicalPaintsMutex();
 
     SkPDFGraphicState();
     explicit SkPDFGraphicState(const SkPaint& paint);
diff --git a/include/pdf/SkPDFShader.h b/include/pdf/SkPDFShader.h
index 6b6ae03..439d83b 100644
--- a/include/pdf/SkPDFShader.h
+++ b/include/pdf/SkPDFShader.h
@@ -56,7 +56,7 @@
     };
     // This should be made a hash table if performance is a problem.
     static SkTDArray<ShaderCanonicalEntry>& CanonicalShaders();
-    static SkMutex& CanonicalShadersMutex();
+    static SkBaseMutex& CanonicalShadersMutex();
     static void RemoveShader(SkPDFObject* shader);
 
     SkPDFShader();
diff --git a/include/utils/SkDeferredCanvas.h b/include/utils/SkDeferredCanvas.h
new file mode 100644
index 0000000..87797ac
--- /dev/null
+++ b/include/utils/SkDeferredCanvas.h
@@ -0,0 +1,302 @@
+/*
+ * Copyright 2011 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkDeferredCanvas_DEFINED
+#define SkDeferredCanvas_DEFINED
+
+#include "SkCanvas.h"
+#include "SkDevice.h"
+#include "SkPicture.h"
+#include "SkPixelRef.h"
+
+/** \class SkDeferredCanvas
+    Subclass of SkCanvas that encapsulates an SkPicture for deferred drawing.
+    The main difference between this class and SkPictureRecord (the canvas
+    provided by SkPicture) is that this is a full drop-in replacement for
+    SkCanvas, while SkPictureRecord only supports draw operations.
+    SkDeferredCanvas will transparently trigger the flushing of deferred
+    draw operations when an attempt is made to access the pixel data.
+*/
+class SK_API SkDeferredCanvas : public SkCanvas {
+public:
+    class DeviceContext;
+
+    SkDeferredCanvas();
+
+    /** Construct a canvas with the specified device to draw into.
+        Equivalent to calling default constructor, then setDevice.
+        @param device Specifies a device for the canvas to draw into.
+    */
+    explicit SkDeferredCanvas(SkDevice* device);
+
+    /** Construct a canvas with the specified device to draw into, and
+     *  a device context. Equivalent to calling default constructor, then
+     *  setDevice.
+     *  @param device Specifies a device for the canvas to draw into.
+     *  @param deviceContext interface for the device's graphics context
+     */
+    explicit SkDeferredCanvas(SkDevice* device, DeviceContext* deviceContext);
+
+    virtual ~SkDeferredCanvas();
+
+    /**
+     *  Specify a device to be used by this canvas. Calling setDevice will
+     *  release the previously set device, if any.
+     *
+     *  @param device The device that the canvas will draw into
+     *  @return The device argument, for convenience.
+     */
+    virtual SkDevice* setDevice(SkDevice* device);
+
+    /**
+     *  Specify a deviceContext to be used by this canvas. Calling
+     *  setDeviceContext will release the previously set deviceContext, if any.
+     *  A deviceContext must be specified if the device uses a graphics context
+     *  that requires some form of state initialization prior to drawing
+     *  and/or explicit flushing to synchronize the execution of rendering
+     *  operations.
+     *  Note: Must be called after the device is set with setDevice.
+     *
+     *  @param deviceContext interface for the device's graphics context
+     *  @return The deviceContext argument, for convenience.
+     */
+    DeviceContext* setDeviceContext(DeviceContext* deviceContext);
+
+    /**
+     *  Enable or disable deferred drawing. When deferral is disabled,
+     *  pending draw operations are immediately flushed and from then on,
+     *  the SkDeferredCanvas behaves just like a regular SkCanvas.
+     *  This method must not be called while the save/restore stack is in use.
+     *  @param deferred true/false
+     */
+    void setDeferredDrawing(bool deferred);
+
+    // Overrides of the SkCanvas interface
+    virtual int save(SaveFlags flags) SK_OVERRIDE;
+    virtual int saveLayer(const SkRect* bounds, const SkPaint* paint,
+                          SaveFlags flags) SK_OVERRIDE;
+    virtual void restore() SK_OVERRIDE;
+    virtual bool isDrawingToLayer() const SK_OVERRIDE;
+    virtual bool translate(SkScalar dx, SkScalar dy) SK_OVERRIDE;
+    virtual bool scale(SkScalar sx, SkScalar sy) SK_OVERRIDE;
+    virtual bool rotate(SkScalar degrees) SK_OVERRIDE;
+    virtual bool skew(SkScalar sx, SkScalar sy) SK_OVERRIDE;
+    virtual bool concat(const SkMatrix& matrix) SK_OVERRIDE;
+    virtual void setMatrix(const SkMatrix& matrix) SK_OVERRIDE;
+    virtual bool clipRect(const SkRect& rect, SkRegion::Op op,
+                          bool doAntiAlias) SK_OVERRIDE;
+    virtual bool clipPath(const SkPath& path, SkRegion::Op op,
+                          bool doAntiAlias) SK_OVERRIDE;
+    virtual bool clipRegion(const SkRegion& deviceRgn,
+                            SkRegion::Op op) SK_OVERRIDE;
+    virtual void clear(SkColor) SK_OVERRIDE;
+    virtual void drawPaint(const SkPaint& paint) SK_OVERRIDE;
+    virtual void drawPoints(PointMode mode, size_t count, const SkPoint pts[],
+                            const SkPaint& paint) SK_OVERRIDE;
+    virtual void drawRect(const SkRect& rect, const SkPaint& paint)
+                          SK_OVERRIDE;
+    virtual void drawPath(const SkPath& path, const SkPaint& paint)
+                          SK_OVERRIDE;
+    virtual void drawBitmap(const SkBitmap& bitmap, SkScalar left,
+                            SkScalar top, const SkPaint* paint)
+                            SK_OVERRIDE;
+    virtual void drawBitmapRect(const SkBitmap& bitmap, const SkIRect* src,
+                                const SkRect& dst, const SkPaint* paint)
+                                SK_OVERRIDE;
+
+    virtual void drawBitmapMatrix(const SkBitmap& bitmap, const SkMatrix& m,
+                                  const SkPaint* paint) SK_OVERRIDE;
+    virtual void drawBitmapNine(const SkBitmap& bitmap, const SkIRect& center,
+                                const SkRect& dst, const SkPaint* paint)
+                                SK_OVERRIDE;
+    virtual void drawSprite(const SkBitmap& bitmap, int left, int top,
+                            const SkPaint* paint) SK_OVERRIDE;
+    virtual void drawText(const void* text, size_t byteLength, SkScalar x,
+                          SkScalar y, const SkPaint& paint) SK_OVERRIDE;
+    virtual void drawPosText(const void* text, size_t byteLength,
+                             const SkPoint pos[], const SkPaint& paint)
+                             SK_OVERRIDE;
+    virtual void drawPosTextH(const void* text, size_t byteLength,
+                              const SkScalar xpos[], SkScalar constY,
+                              const SkPaint& paint) SK_OVERRIDE;
+    virtual void drawTextOnPath(const void* text, size_t byteLength,
+                                const SkPath& path, const SkMatrix* matrix,
+                                const SkPaint& paint) SK_OVERRIDE;
+    virtual void drawPicture(SkPicture& picture) SK_OVERRIDE;
+    virtual void drawVertices(VertexMode vmode, int vertexCount,
+                              const SkPoint vertices[], const SkPoint texs[],
+                              const SkColor colors[], SkXfermode* xmode,
+                              const uint16_t indices[], int indexCount,
+                              const SkPaint& paint) SK_OVERRIDE;
+    virtual SkBounder* setBounder(SkBounder* bounder) SK_OVERRIDE;
+    virtual SkDrawFilter* setDrawFilter(SkDrawFilter* filter) SK_OVERRIDE;
+
+private:
+    void flushIfNeeded(const SkBitmap& bitmap);
+
+public:
+    class DeviceContext : public SkRefCnt {
+    public:
+        virtual void prepareForDraw() {}
+    };
+
+public:
+    class DeferredDevice : public SkDevice {
+    public:
+        /**
+         *  Constructor
+         *  @param immediateDevice device to be drawn to when flushing
+         *      deferred operations
+         *  @param deviceContext callback interface for managing graphics
+         *      context state, can be NULL.
+         */
+        DeferredDevice(SkDevice* immediateDevice,
+            DeviceContext* deviceContext = NULL);
+        ~DeferredDevice();
+
+        /**
+         *  Sets the device context to be used with the device.
+         *  @param deviceContext callback interface for managing graphics
+         *      context state, can be NULL.
+         */
+        void setDeviceContext(DeviceContext* deviceContext);
+
+        /**
+         *  Returns the recording canvas.
+         */
+        SkCanvas* recordingCanvas() const {return fRecordingCanvas;}
+
+        /**
+         *  Returns the immediate (non deferred) canvas.
+         */
+        SkCanvas* immediateCanvas() const {return fImmediateCanvas;}
+
+        /**
+         *  Returns the immediate (non deferred) device.
+         */
+        SkDevice* immediateDevice() const {return fImmediateDevice;}
+
+        /**
+         *  Returns true if an opaque draw operation covering the entire canvas
+         *  was performed since the last call to isFreshFrame().
+         */
+        bool isFreshFrame();
+
+        void flushPending();
+        void contentsCleared();
+        void flushIfNeeded(const SkBitmap& bitmap);
+
+        virtual uint32_t getDeviceCapabilities() SK_OVERRIDE;
+        virtual int width() const SK_OVERRIDE;
+        virtual int height() const SK_OVERRIDE;
+        virtual SkGpuRenderTarget* accessRenderTarget() SK_OVERRIDE;
+
+        virtual SkDevice* onCreateCompatibleDevice(SkBitmap::Config config,
+                                                   int width, int height,
+                                                   bool isOpaque,
+                                                   Usage usage) SK_OVERRIDE;
+
+        virtual void writePixels(const SkBitmap& bitmap, int x, int y,
+                                 SkCanvas::Config8888 config8888) SK_OVERRIDE;
+
+    protected:
+        virtual const SkBitmap& onAccessBitmap(SkBitmap*) SK_OVERRIDE;
+        virtual bool onReadPixels(const SkBitmap& bitmap,
+                                  int x, int y,
+                                  SkCanvas::Config8888 config8888) SK_OVERRIDE;
+
+        // The following methods are no-ops on a deferred device
+        virtual bool filterTextFlags(const SkPaint& paint, TextFlags*)
+            SK_OVERRIDE
+            {return false;}
+        virtual void setMatrixClip(const SkMatrix&, const SkRegion&,
+                                   const SkClipStack&) SK_OVERRIDE
+            {}
+        virtual void gainFocus(SkCanvas*, const SkMatrix&, const SkRegion&,
+                               const SkClipStack&) SK_OVERRIDE
+            {}
+
+        // None of the following drawing methods should ever get called on the
+        // deferred device
+        virtual void clear(SkColor color)
+            {SkASSERT(0);}
+        virtual void drawPaint(const SkDraw&, const SkPaint& paint)
+            {SkASSERT(0);}
+        virtual void drawPoints(const SkDraw&, SkCanvas::PointMode mode,
+                                size_t count, const SkPoint[],
+                                const SkPaint& paint)
+            {SkASSERT(0);}
+        virtual void drawRect(const SkDraw&, const SkRect& r,
+                              const SkPaint& paint)
+            {SkASSERT(0);}
+        virtual void drawPath(const SkDraw&, const SkPath& path,
+                              const SkPaint& paint,
+                              const SkMatrix* prePathMatrix = NULL,
+                              bool pathIsMutable = false)
+            {SkASSERT(0);}
+        virtual void drawBitmap(const SkDraw&, const SkBitmap& bitmap,
+                                const SkIRect* srcRectOrNull,
+                                const SkMatrix& matrix, const SkPaint& paint)
+            {SkASSERT(0);}
+        virtual void drawSprite(const SkDraw&, const SkBitmap& bitmap,
+                                int x, int y, const SkPaint& paint)
+            {SkASSERT(0);}
+        virtual void drawText(const SkDraw&, const void* text, size_t len,
+                              SkScalar x, SkScalar y, const SkPaint& paint)
+            {SkASSERT(0);}
+        virtual void drawPosText(const SkDraw&, const void* text, size_t len,
+                                 const SkScalar pos[], SkScalar constY,
+                                 int scalarsPerPos, const SkPaint& paint)
+            {SkASSERT(0);}
+        virtual void drawTextOnPath(const SkDraw&, const void* text,
+                                    size_t len, const SkPath& path,
+                                    const SkMatrix* matrix,
+                                    const SkPaint& paint)
+            {SkASSERT(0);}
+        virtual void drawPosTextOnPath(const SkDraw& draw, const void* text,
+                                       size_t len, const SkPoint pos[],
+                                       const SkPaint& paint,
+                                       const SkPath& path,
+                                       const SkMatrix* matrix)
+            {SkASSERT(0);}
+        virtual void drawVertices(const SkDraw&, SkCanvas::VertexMode,
+                                  int vertexCount, const SkPoint verts[],
+                                  const SkPoint texs[], const SkColor colors[],
+                                  SkXfermode* xmode, const uint16_t indices[],
+                                  int indexCount, const SkPaint& paint)
+            {SkASSERT(0);}
+        virtual void drawDevice(const SkDraw&, SkDevice*, int x, int y,
+                                const SkPaint&)
+            {SkASSERT(0);}
+    private:
+        virtual void flush();
+
+        SkPicture fPicture;
+        SkDevice* fImmediateDevice;
+        SkCanvas* fImmediateCanvas;
+        SkCanvas* fRecordingCanvas;
+        DeviceContext* fDeviceContext;
+        bool fFreshFrame;
+    };
+
+    DeferredDevice* getDeferredDevice() const;
+
+protected:
+    virtual SkCanvas* canvasForDrawIter();
+
+private:
+    SkCanvas* drawingCanvas() const;
+    bool isFullFrame(const SkRect*, const SkPaint*) const;
+    void validate() const;
+    void init();
+    bool            fDeferredDrawing;
+
+    typedef SkCanvas INHERITED;
+};
+
+
+#endif
diff --git a/include/utils/SkDumpCanvas.h b/include/utils/SkDumpCanvas.h
index 5bfd6f6..de2af04 100644
--- a/include/utils/SkDumpCanvas.h
+++ b/include/utils/SkDumpCanvas.h
@@ -12,7 +12,7 @@
 
 /** This class overrides all the draw methods on SkCanvas, and formats them
     as text, and then sends that to a Dumper helper object.
- 
+
     Typical use might be to dump a display list to a log file to see what is
     being drawn.
  */
@@ -22,17 +22,17 @@
 
     explicit SkDumpCanvas(Dumper* = 0);
     virtual ~SkDumpCanvas();
-    
+
     enum Verb {
         kNULL_Verb,
 
         kSave_Verb,
         kRestore_Verb,
-        
+
         kMatrix_Verb,
-        
+
         kClip_Verb,
-        
+
         kDrawPaint_Verb,
         kDrawPoints_Verb,
         kDrawRect_Verb,
@@ -43,7 +43,7 @@
         kDrawVertices_Verb,
         kDrawData_Verb
     };
-    
+
     /** Subclasses of this are installed on the DumpCanvas, and then called for
         each drawing command.
      */
@@ -52,12 +52,12 @@
         virtual void dump(SkDumpCanvas*, SkDumpCanvas::Verb, const char str[],
                           const SkPaint*) = 0;
     };
-        
+
     Dumper* getDumper() const { return fDumper; }
     void    setDumper(Dumper*);
-    
+
     int getNestLevel() const { return fNestLevel; }
-    
+
     virtual int save(SaveFlags) SK_OVERRIDE;
     virtual int saveLayer(const SkRect* bounds, const SkPaint* paint,
                           SaveFlags) SK_OVERRIDE;
@@ -69,7 +69,7 @@
     virtual bool skew(SkScalar sx, SkScalar sy) SK_OVERRIDE;
     virtual bool concat(const SkMatrix& matrix) SK_OVERRIDE;
     virtual void setMatrix(const SkMatrix& matrix) SK_OVERRIDE;
-    
+
     virtual bool clipRect(const SkRect&, SkRegion::Op, bool) SK_OVERRIDE;
     virtual bool clipPath(const SkPath&, SkRegion::Op, bool) SK_OVERRIDE;
     virtual bool clipRegion(const SkRegion& deviceRgn,
@@ -109,7 +109,7 @@
 private:
     Dumper* fDumper;
     int     fNestLevel; // for nesting recursive elements like pictures
-    
+
     void dump(Verb, const SkPaint*, const char format[], ...);
 
     typedef SkCanvas INHERITED;
@@ -121,16 +121,16 @@
 class SkFormatDumper : public SkDumpCanvas::Dumper {
 public:
     SkFormatDumper(void (*)(const char text[], void* refcon), void* refcon);
-    
+
     // override from baseclass that does the formatting, and in turn calls
     // the function pointer that was passed to the constructor
     virtual void dump(SkDumpCanvas*, SkDumpCanvas::Verb, const char str[],
                       const SkPaint*) SK_OVERRIDE;
-    
+
 private:
     void (*fProc)(const char*, void*);
     void* fRefcon;
-    
+
     typedef SkDumpCanvas::Dumper INHERITED;
 };
 
diff --git a/include/utils/SkNWayCanvas.h b/include/utils/SkNWayCanvas.h
index 4e39c6b..69c1fcf 100644
--- a/include/utils/SkNWayCanvas.h
+++ b/include/utils/SkNWayCanvas.h
@@ -15,7 +15,7 @@
 public:
     SkNWayCanvas();
     virtual ~SkNWayCanvas();
-    
+
     void addCanvas(SkCanvas*);
     void removeCanvas(SkCanvas*);
     void removeAll();
@@ -70,10 +70,10 @@
 
     virtual SkBounder* setBounder(SkBounder*) SK_OVERRIDE;
     virtual SkDrawFilter* setDrawFilter(SkDrawFilter*) SK_OVERRIDE;
-    
+
 private:
     SkTDArray<SkCanvas*> fList;
-    
+
     class Iter;
 
     typedef SkCanvas INHERITED;
diff --git a/include/utils/SkProxyCanvas.h b/include/utils/SkProxyCanvas.h
index e96b9b2..720436b 100644
--- a/include/utils/SkProxyCanvas.h
+++ b/include/utils/SkProxyCanvas.h
@@ -13,7 +13,7 @@
 /** This class overrides all virtual methods on SkCanvas, and redirects them
     to a "proxy", another SkCanvas instance. It can be the basis for
     intercepting (and possibly modifying) calls to a canvas.
- 
+
     There must be a proxy installed before the proxycanvas can be used (i.e.
     before its virtual methods can be called).
  */
@@ -22,10 +22,10 @@
     SkProxyCanvas() : fProxy(NULL) {}
     SkProxyCanvas(SkCanvas* proxy);
     virtual ~SkProxyCanvas();
-    
+
     SkCanvas*   getProxy() const { return fProxy; }
     void        setProxy(SkCanvas* proxy);
-    
+
     virtual int save(SaveFlags flags = kMatrixClip_SaveFlag) SK_OVERRIDE;
     virtual int saveLayer(const SkRect* bounds, const SkPaint* paint,
                           SaveFlags flags = kARGB_ClipLayer_SaveFlag) SK_OVERRIDE;
@@ -37,7 +37,7 @@
     virtual bool skew(SkScalar sx, SkScalar sy) SK_OVERRIDE;
     virtual bool concat(const SkMatrix& matrix) SK_OVERRIDE;
     virtual void setMatrix(const SkMatrix& matrix) SK_OVERRIDE;
-    
+
     virtual bool clipRect(const SkRect&, SkRegion::Op, bool) SK_OVERRIDE;
     virtual bool clipPath(const SkPath&, SkRegion::Op, bool) SK_OVERRIDE;
     virtual bool clipRegion(const SkRegion& deviceRgn,
@@ -79,7 +79,7 @@
 
 private:
     SkCanvas*   fProxy;
-    
+
     typedef SkCanvas INHERITED;
 };
 
diff --git a/include/utils/mac/SkCGUtils.h b/include/utils/mac/SkCGUtils.h
index 055e24c..46f8996 100644
--- a/include/utils/mac/SkCGUtils.h
+++ b/include/utils/mac/SkCGUtils.h
@@ -25,8 +25,8 @@
  *  Create an imageref from the specified bitmap using the specified colorspace.
  *  If space is NULL, then CGColorSpaceCreateDeviceRGB() is used.
  */
-CGImageRef SkCreateCGImageRefWithColorspace(const SkBitmap& bm,
-                                            CGColorSpaceRef space);
+SK_API CGImageRef SkCreateCGImageRefWithColorspace(const SkBitmap& bm,
+                                                   CGColorSpaceRef space);
 
 /**
  *  Create an imageref from the specified bitmap using the colorspace returned
@@ -46,4 +46,19 @@
 
 bool SkPDFDocumentToBitmap(SkStream* stream, SkBitmap* output);
 
+/**
+ *  Return a provider that wraps the specified stream. The provider becomes
+ *  an additional owner, so the caller must still manage its own reference.
+ *
+ *  To hand-off ownership of the stream to the provider, the caller must do
+ *  something like the following:
+ *
+ *  SkStream* stream = new ...;
+ *  CGDataProviderRef provider = SkCreateDataProviderFromStream(stream);
+ *  stream->unref();
+ *
+ *  Now when the provider is finally deleted, it will delete the stream.
+ */
+CGDataProviderRef SkCreateDataProviderFromStream(SkStream*);
+
 #endif
diff --git a/samplecode/SampleApp.cpp b/samplecode/SampleApp.cpp
index 95923c8..51d42b6 100644
--- a/samplecode/SampleApp.cpp
+++ b/samplecode/SampleApp.cpp
@@ -22,7 +22,7 @@
 #include "GrContext.h"
 #include "SkTypeface.h"
 
-#include "GrGLInterface.h"
+#include "gl/GrGLInterface.h"
 #include "GrRenderTarget.h"
 
 #include "SkPDFDevice.h"
@@ -1544,6 +1544,17 @@
                 this->updateTitle();
             }
             return true;
+        case 'p':   // free GPU resources; log the cache size read beforehand
+            {
+                GrContext* grContext = this->getGrContext();
+                if (grContext) {
+                    size_t cacheBytes = grContext->getGpuTextureCacheBytes();
+                    grContext->freeGpuResources();
+                    SkDebugf("Purged %lu bytes from the GPU resource cache.\n",
+                             static_cast<unsigned long>(cacheBytes)); // size_t is not %d
+                }
+            }
+            return true;
         case 's':
             fScale = !fScale;
             this->inval(NULL);
diff --git a/samplecode/SamplePath.cpp b/samplecode/SamplePath.cpp
index 7e2750a..ddfbb71 100644
--- a/samplecode/SamplePath.cpp
+++ b/samplecode/SamplePath.cpp
@@ -67,7 +67,8 @@
         SkIRect ir;
         r.round(&ir);
         printf("[%g %g %g %g] [%x %x %x %x]\n",
-               r.fLeft, r.fTop, r.fRight, r.fBottom,
+               SkScalarToDouble(r.fLeft), SkScalarToDouble(r.fTop),
+               SkScalarToDouble(r.fRight), SkScalarToDouble(r.fBottom),
                ir.fLeft, ir.fTop, ir.fRight, ir.fBottom);
     }
     
diff --git a/samplecode/SampleTextOnPath.cpp b/samplecode/SampleTextOnPath.cpp
index f3c98f8..2b73e9a 100644
--- a/samplecode/SampleTextOnPath.cpp
+++ b/samplecode/SampleTextOnPath.cpp
@@ -117,7 +117,7 @@
     paint.setAntiAlias(true);
     paint.setDevKernText(gDevKern);
     
-    (void)paint.measureText(gText, strlen(gText), &bounds, NULL);
+    (void)paint.measureText(gText, strlen(gText), &bounds);
     paint.setColor(SK_ColorGREEN);
     bounds.offset(x, y);
     canvas->drawRect(bounds, paint);
@@ -288,4 +288,3 @@
 }
 
 static SkViewRegister reg(MyFactory);
-
diff --git a/src/animator/SkDisplayEvent.cpp b/src/animator/SkDisplayEvent.cpp
index e3116d7..7d2e786 100644
--- a/src/animator/SkDisplayEvent.cpp
+++ b/src/animator/SkDisplayEvent.cpp
@@ -262,36 +262,35 @@
 #include "SkTextBox.h"
 #include "SkXMLWriter.h"
 
-void SkMetaData::setPtr(char const*, void* ) {}
+void SkMetaData::setPtr(char const*, void*, PtrProc ) {}
 void SkMetaData::setS32(char const*, int ) {}
 bool SkEventSink::doEvent(SkEvent const& ) { return false; }
 bool SkXMLParser::parse(SkStream& ) { return false; }
 SkXMLParserError::SkXMLParserError( ) {}
-void SkEvent::setType(char const*, unsigned long ) {}
-bool SkEvent::PostTime(SkEvent*, unsigned int, unsigned int ) { return false; }
-SkEvent::SkEvent(char const* ) {}
-SkEvent::SkEvent(SkEvent const& ) {}
+void SkEvent::setType(char const*, size_t ) {}
+void SkEvent::postTime(SkMSec) {}
+SkEvent::SkEvent(char const*, SkEventSinkID) {}
+SkEvent::SkEvent(SkEvent const&) {}
 SkEvent::SkEvent( ) {}
 SkEvent::~SkEvent( ) {}
 bool SkEventSink::onQuery(SkEvent* ) { return false; }
 SkEventSink::SkEventSink( ) {}
 SkEventSink::~SkEventSink( ) {}
-bool SkXMLParser::parse(char const*, unsigned long ) { return false; }
+bool SkXMLParser::parse(char const*, size_t ) { return false; }
 bool SkXMLParser::parse(SkDOM const&, SkDOMNode const* ) { return false; }
-bool SkEvent::Post(SkEvent*, unsigned int, unsigned int ) { return false; }
-void SkParse::UnitTest( ) {}
-const char* SkMetaData::findString(char const*) const {return 0;}
-bool SkMetaData::findPtr(char const*, void**) const {return false;}
-bool SkMetaData::findS32(char const*, int*) const {return false;}
-bool SkEvent::isType(char const*, unsigned long) const { return false; }
+//void SkParse::UnitTest( ) {}
+const char* SkMetaData::findString(char const* ) const {return 0;}
+bool SkMetaData::findPtr(char const*, void**, PtrProc* ) const {return false;}
+bool SkMetaData::findS32(char const*, int* ) const {return false;}
+bool SkEvent::isType(char const*, size_t ) const { return false; }
 void SkMetaData::setString(char const*, char const* ) {}
-const char* SkParse::FindNamedColor(char const*, unsigned long, unsigned int* ) {return false; }
+const char* SkParse::FindNamedColor(char const*, size_t, SkColor* ) {return 0; } // stub: null result
 const char* SkMetaData::Iter::next(SkMetaData::Type*, int* ) { return false; }
 SkMetaData::Iter::Iter(SkMetaData const& ) {}
-bool SkMetaData::findScalar(char const*, int*) const {return false;}
+bool SkMetaData::findScalar(char const*, SkScalar* ) const {return false;}
 void SkMetaData::reset( ) {}
 void SkEvent::setType(SkString const& ) {}
-bool SkMetaData::findBool(char const*, bool*) const {return false;}
+bool SkMetaData::findBool(char const*, bool* ) const {return false;}
 void SkEvent::getType(SkString*) const {}
 bool SkXMLParser::endElement(char const* ) { return false; }
 bool SkXMLParser::addAttribute(char const*, char const* ) { return false;}
@@ -302,22 +301,22 @@
 SkXMLParser::~SkXMLParser( ) {}
 SkXMLParserError::~SkXMLParserError( ) {}
 void SkXMLParserError::getErrorString(SkString*) const {}
-void SkTextBox::setSpacing(int, int ) {}
+void SkTextBox::setSpacing(SkScalar, SkScalar ) {}
 void SkTextBox::setSpacingAlign(SkTextBox::SpacingAlign ) {}
-void SkTextBox::draw(SkCanvas*, char const*, unsigned long, SkPaint const& ) {}
+void SkTextBox::draw(SkCanvas*, char const*, size_t, SkPaint const& ) {}
 void SkTextBox::setBox(SkRect const& ) {}
 void SkTextBox::setMode(SkTextBox::Mode ) {}
 SkTextBox::SkTextBox( ) {}
-void SkMetaData::setScalar(char const*, int ) {}
-const char* SkParse::FindScalar(char const*, int* ) {return 0; }
-const char* SkParse::FindScalars(char const*, int*, int ) {return 0; }
+void SkMetaData::setScalar(char const*, SkScalar ) {}
+const char* SkParse::FindScalar(char const*, SkScalar* ) {return 0; }
+const char* SkParse::FindScalars(char const*, SkScalar*, int ) {return 0; }
 const char* SkParse::FindHex(char const*, unsigned int* ) {return 0; }
 const char* SkParse::FindS32(char const*, int* ) {return 0; }
 void SkXMLWriter::addAttribute(char const*, char const* ) {}
 void SkXMLWriter::startElement(char const* ) {}
 void SkXMLWriter::doEnd(SkXMLWriter::Elem* ) {}
 SkXMLWriter::Elem* SkXMLWriter::getEnd( ) { return 0; }
-bool SkXMLWriter::doStart(char const*, unsigned long ) { return false; }
+bool SkXMLWriter::doStart(char const*, size_t ) { return false; }
 SkXMLWriter::SkXMLWriter(bool ) {}
 SkXMLWriter::~SkXMLWriter( ) {}
 SkMetaData::SkMetaData() {}
diff --git a/src/core/SkAAClip.cpp b/src/core/SkAAClip.cpp
index 096fd6b..64c2728 100644
--- a/src/core/SkAAClip.cpp
+++ b/src/core/SkAAClip.cpp
@@ -893,10 +893,10 @@
         SkASSERT(count > 0);
         SkASSERT(fBounds.contains(x, y));
         SkASSERT(fBounds.contains(x + count - 1, y));
-
+        
         x -= fBounds.left();
         y -= fBounds.top();
-
+                             
         Row* row = fCurrRow;
         if (y != fPrevY) {
             SkASSERT(y > fPrevY);
@@ -1142,12 +1142,33 @@
 };
 
 class SkAAClip::BuilderBlitter : public SkBlitter {
+    int fLastY;
+
+    /*
+        If we see a gap of 1 or more empty scanlines while building in Y-order,
+        we inject an explicit empty scanline (alpha==0)
+     
+        See AAClipTest.cpp : test_path_with_hole()
+     */
+    void checkForYGap(int y) {
+        SkASSERT(y >= fLastY);
+        if (fLastY > -SK_MaxS32) {
+            int gap = y - fLastY;
+            if (gap > 1) {
+                fBuilder->addRun(fLeft, y - 1, 0, fRight - fLeft);
+            }
+        }
+        fLastY = y;
+    }
+
 public:
+
     BuilderBlitter(Builder* builder) {
         fBuilder = builder;
         fLeft = builder->getBounds().fLeft;
         fRight = builder->getBounds().fRight;
         fMinY = SK_MaxS32;
+        fLastY = -SK_MaxS32;    // sentinel
     }
 
     void finish() {
@@ -1166,17 +1187,22 @@
     virtual void blitV(int x, int y, int height, SkAlpha alpha) SK_OVERRIDE {
         this->recordMinY(y);
         fBuilder->addColumn(x, y, alpha, height);
+        fLastY = y + height - 1;
     }
 
     virtual void blitRect(int x, int y, int width, int height) SK_OVERRIDE {
         this->recordMinY(y);
+        this->checkForYGap(y);
         fBuilder->addRectRun(x, y, width, height);
+        fLastY = y + height - 1;
     }
 
     virtual void blitAntiRect(int x, int y, int width, int height,
                      SkAlpha leftAlpha, SkAlpha rightAlpha) SK_OVERRIDE {
         this->recordMinY(y);
+        this->checkForYGap(y);
         fBuilder->addAntiRectRun(x, y, width, height, leftAlpha, rightAlpha);
+        fLastY = y + height - 1;
     }
 
     virtual void blitMask(const SkMask&, const SkIRect& clip) SK_OVERRIDE
@@ -1188,12 +1214,14 @@
 
     virtual void blitH(int x, int y, int width) SK_OVERRIDE {
         this->recordMinY(y);
+        this->checkForYGap(y);
         fBuilder->addRun(x, y, 0xFF, width);
     }
 
     virtual void blitAntiH(int x, int y, const SkAlpha alpha[],
                            const int16_t runs[]) SK_OVERRIDE {
         this->recordMinY(y);
+        this->checkForYGap(y);
         for (;;) {
             int count = *runs;
             if (count <= 0) {
diff --git a/src/core/SkBitmap.cpp b/src/core/SkBitmap.cpp
index 2b3e7c4..58d0bd8 100644
--- a/src/core/SkBitmap.cpp
+++ b/src/core/SkBitmap.cpp
@@ -821,6 +821,7 @@
 
     SkBitmap dst;
     dst.setConfig(this->config(), r.width(), r.height(), this->rowBytes());
+    dst.setIsVolatile(this->isVolatile());
 
     if (fPixelRef) {
         // share the pixelref with a custom offset
diff --git a/src/core/SkBitmapProcState.h b/src/core/SkBitmapProcState.h
index 98c8782..c04992b 100644
--- a/src/core/SkBitmapProcState.h
+++ b/src/core/SkBitmapProcState.h
@@ -136,5 +136,13 @@
                               int count, SkPMColor colors[]);
 void S32_alpha_D32_filter_DX(const SkBitmapProcState& s, const uint32_t xy[],
                              int count, SkPMColor colors[]);
+void ClampX_ClampY_filter_scale(const SkBitmapProcState& s, uint32_t xy[],
+                                int count, int x, int y);
+void ClampX_ClampY_nofilter_scale(const SkBitmapProcState& s, uint32_t xy[],
+                                  int count, int x, int y);
+void ClampX_ClampY_filter_affine(const SkBitmapProcState& s,
+                                 uint32_t xy[], int count, int x, int y);
+void ClampX_ClampY_nofilter_affine(const SkBitmapProcState& s,
+                                   uint32_t xy[], int count, int x, int y);
 
 #endif
diff --git a/src/core/SkBitmapProcState_matrixProcs.cpp b/src/core/SkBitmapProcState_matrixProcs.cpp
index d0bc8d8..bda2438 100644
--- a/src/core/SkBitmapProcState_matrixProcs.cpp
+++ b/src/core/SkBitmapProcState_matrixProcs.cpp
@@ -1,4 +1,8 @@
-/* NEON optimized code (C) COPYRIGHT 2009 Motorola */
+/* NEON optimized code (C) COPYRIGHT 2009 Motorola 
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
 
 #include "SkBitmapProcState.h"
 #include "SkPerspIter.h"
diff --git a/src/core/SkBitmapProcState_matrix_clamp.h b/src/core/SkBitmapProcState_matrix_clamp.h
index e2191eb..06bc0fa 100644
--- a/src/core/SkBitmapProcState_matrix_clamp.h
+++ b/src/core/SkBitmapProcState_matrix_clamp.h
@@ -1,4 +1,9 @@
-/* NEON optimized code (C) COPYRIGHT 2009 Motorola */
+/* NEON optimized code (C) COPYRIGHT 2009 Motorola
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
 /*
  * Modifications done in-house at Motorola 
  *
diff --git a/src/core/SkBitmapProcState_matrix_repeat.h b/src/core/SkBitmapProcState_matrix_repeat.h
index 7c97a03..a7fa089 100644
--- a/src/core/SkBitmapProcState_matrix_repeat.h
+++ b/src/core/SkBitmapProcState_matrix_repeat.h
@@ -1,4 +1,9 @@
-/* NEON optimized code (C) COPYRIGHT 2009 Motorola */
+/* NEON optimized code (C) COPYRIGHT 2009 Motorola
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+ 
 /*
  * Modifications done in-house at Motorola 
  *
diff --git a/src/core/SkBlitMask.h b/src/core/SkBlitMask.h
index 299f6d1..9c0fe0f 100644
--- a/src/core/SkBlitMask.h
+++ b/src/core/SkBlitMask.h
@@ -29,6 +29,15 @@
     typedef void (*ColorProc)(void* dst, size_t dstRB,
                               const void* mask, size_t maskRB,
                               SkColor color, int width, int height);
+    
+    /**
+     *  Function pointer that blits a row of mask(lcd16) into a row of dst 
+     *  colorized by a single color. The number of pixels to blit is specified
+     *  by width.
+     */
+    typedef void (*BlitLCD16RowProc)(SkPMColor dst[], const uint16_t src[],
+                                     SkColor color, int width, 
+                                     SkPMColor opaqueDst);
 
     /**
      *  Function pointer that blits a row of src colors through a row of a mask
@@ -49,6 +58,17 @@
      *  or NULL if no optimized routine is available.
      */
     static ColorProc PlatformColorProcs(SkBitmap::Config, SkMask::Format, SkColor);
+    
+    /**
+     *  Public entry-point to return a blitcolor BlitLCD16RowProc.
+     */
+    static BlitLCD16RowProc BlitLCD16RowFactory(bool isOpaque);
+
+    /**
+     *  Return either platform specific optimized blitcolor BlitLCD16RowProc,
+     *  or NULL if no optimized routine is available.
+     */
+    static BlitLCD16RowProc PlatformBlitRowProcs16(bool isOpaque);
 
     enum RowFlags {
         kSrcIsOpaque_RowFlag    = 1 << 0
diff --git a/src/core/SkBlitMask_D32.cpp b/src/core/SkBlitMask_D32.cpp
index 341627a..c97e9e6 100644
--- a/src/core/SkBlitMask_D32.cpp
+++ b/src/core/SkBlitMask_D32.cpp
@@ -64,106 +64,16 @@
     } while (--height != 0);
 }
 
-///////////////////////////////////////////////////////////////////////////////
-
-static inline int upscale31To32(int value) {
-    SkASSERT((unsigned)value <= 31);
-    return value + (value >> 4);
-}
-
-static inline int blend32(int src, int dst, int scale) {
-    SkASSERT((unsigned)src <= 0xFF);
-    SkASSERT((unsigned)dst <= 0xFF);
-    SkASSERT((unsigned)scale <= 32);
-    return dst + ((src - dst) * scale >> 5);
-}
-
-static void blit_lcd16_row(SkPMColor dst[], const uint16_t src[],
-                           SkColor color, int width, SkPMColor) {
-    int srcA = SkColorGetA(color);
-    int srcR = SkColorGetR(color);
-    int srcG = SkColorGetG(color);
-    int srcB = SkColorGetB(color);
-    
-    srcA = SkAlpha255To256(srcA);
-    
-    for (int i = 0; i < width; i++) {
-        uint16_t mask = src[i];
-        if (0 == mask) {
-            continue;
-        }
-        
-        SkPMColor d = dst[i];
-        
-        /*  We want all of these in 5bits, hence the shifts in case one of them
-         *  (green) is 6bits.
-         */
-        int maskR = SkGetPackedR16(mask) >> (SK_R16_BITS - 5);
-        int maskG = SkGetPackedG16(mask) >> (SK_G16_BITS - 5);
-        int maskB = SkGetPackedB16(mask) >> (SK_B16_BITS - 5);
-        
-        // Now upscale them to 0..32, so we can use blend32
-        maskR = upscale31To32(maskR);
-        maskG = upscale31To32(maskG);
-        maskB = upscale31To32(maskB);
-        
-        maskR = maskR * srcA >> 8;
-        maskG = maskG * srcA >> 8;
-        maskB = maskB * srcA >> 8;
-        
-        int dstR = SkGetPackedR32(d);
-        int dstG = SkGetPackedG32(d);
-        int dstB = SkGetPackedB32(d);
-        
-        // LCD blitting is only supported if the dst is known/required
-        // to be opaque
-        dst[i] = SkPackARGB32(0xFF,
-                              blend32(srcR, dstR, maskR),
-                              blend32(srcG, dstG, maskG),
-                              blend32(srcB, dstB, maskB));
+SkBlitMask::BlitLCD16RowProc SkBlitMask::BlitLCD16RowFactory(bool isOpaque) {
+    BlitLCD16RowProc proc = PlatformBlitRowProcs16(isOpaque);
+    if (proc) {
+        return proc;
     }
-}
-
-static void blit_lcd16_opaque_row(SkPMColor dst[], const uint16_t src[],
-                                  SkColor color, int width, SkPMColor opaqueDst) {
-    int srcR = SkColorGetR(color);
-    int srcG = SkColorGetG(color);
-    int srcB = SkColorGetB(color);
     
-    for (int i = 0; i < width; i++) {
-        uint16_t mask = src[i];
-        if (0 == mask) {
-            continue;
-        }
-        if (0xFFFF == mask) {
-            dst[i] = opaqueDst;
-            continue;
-        }
-        
-        SkPMColor d = dst[i];
-        
-        /*  We want all of these in 5bits, hence the shifts in case one of them
-         *  (green) is 6bits.
-         */
-        int maskR = SkGetPackedR16(mask) >> (SK_R16_BITS - 5);
-        int maskG = SkGetPackedG16(mask) >> (SK_G16_BITS - 5);
-        int maskB = SkGetPackedB16(mask) >> (SK_B16_BITS - 5);
-        
-        // Now upscale them to 0..32, so we can use blend32
-        maskR = upscale31To32(maskR);
-        maskG = upscale31To32(maskG);
-        maskB = upscale31To32(maskB);
-        
-        int dstR = SkGetPackedR32(d);
-        int dstG = SkGetPackedG32(d);
-        int dstB = SkGetPackedB32(d);
-        
-        // LCD blitting is only supported if the dst is known/required
-        // to be opaque
-        dst[i] = SkPackARGB32(0xFF,
-                              blend32(srcR, dstR, maskR),
-                              blend32(srcG, dstG, maskG),
-                              blend32(srcB, dstB, maskB));
+    if (isOpaque) {
+        return  SkBlitLCD16OpaqueRow;
+    } else {
+        return  SkBlitLCD16Row;
     }
 }
 
@@ -175,13 +85,14 @@
     const uint16_t* srcRow = (const uint16_t*)mask;
     SkPMColor       opaqueDst;
     
-    void (*proc)(SkPMColor dst[], const uint16_t src[],
-                 SkColor color, int width, SkPMColor);
-    if (0xFF == SkColorGetA(color)) {
-        proc = blit_lcd16_opaque_row;
+    SkBlitMask::BlitLCD16RowProc proc = NULL;
+    bool isOpaque = (0xFF == SkColorGetA(color));
+    proc = SkBlitMask::BlitLCD16RowFactory(isOpaque);
+    SkASSERT(proc != NULL);
+
+    if (isOpaque) {
         opaqueDst = SkPreMultiplyColor(color);
     } else {
-        proc = blit_lcd16_row;
         opaqueDst = 0;  // ignored
     }
     
@@ -546,9 +457,9 @@
         int maskB = SkGetPackedB16(m) >> (SK_B16_BITS - 5);
         
         // Now upscale them to 0..32, so we can use blend32
-        maskR = upscale31To32(maskR);
-        maskG = upscale31To32(maskG);
-        maskB = upscale31To32(maskB);
+        maskR = SkUpscale31To32(maskR);
+        maskG = SkUpscale31To32(maskG);
+        maskB = SkUpscale31To32(maskB);
         
         int dstR = SkGetPackedR32(d);
         int dstG = SkGetPackedG32(d);
@@ -557,9 +468,9 @@
         // LCD blitting is only supported if the dst is known/required
         // to be opaque
         dst[i] = SkPackARGB32(0xFF,
-                              blend32(srcR, dstR, maskR),
-                              blend32(srcG, dstG, maskG),
-                              blend32(srcB, dstB, maskB));
+                              SkBlend32(srcR, dstR, maskR),
+                              SkBlend32(srcG, dstG, maskG),
+                              SkBlend32(srcB, dstB, maskB));
     }
 }
 
diff --git a/src/core/SkBlitter.cpp b/src/core/SkBlitter.cpp
index ec16066..df25b7c 100644
--- a/src/core/SkBlitter.cpp
+++ b/src/core/SkBlitter.cpp
@@ -247,8 +247,6 @@
         }
         width += count;
         runs += count;
-
-        SkASSERT(width < 20000);
     }
     return width;
 }
diff --git a/src/core/SkCanvas.cpp b/src/core/SkCanvas.cpp
index edfb3a9..aa92dd0 100644
--- a/src/core/SkCanvas.cpp
+++ b/src/core/SkCanvas.cpp
@@ -214,6 +214,7 @@
 class SkDrawIter : public SkDraw {
 public:
     SkDrawIter(SkCanvas* canvas, bool skipEmptyClips = true) {
+        canvas = canvas->canvasForDrawIter();
         fCanvas = canvas;
         canvas->updateDeviceCMCache();
 
@@ -231,8 +232,8 @@
             }
         }
 
-        if (NULL != fCurrLayer) {
-            const DeviceCM* rec = fCurrLayer;
+        const DeviceCM* rec = fCurrLayer;
+        if (rec && rec->fDevice) {
 
             fMatrix = rec->fMatrix;
             fClip   = &((SkRasterClip*)&rec->fClip)->forceGetBW();
@@ -288,18 +289,18 @@
             fLooper->init(canvas);
         }
     }
-    
+
     ~AutoDrawLooper() {
         SkASSERT(fCanvas->getSaveCount() == fSaveCount);
     }
-    
+
     const SkPaint& paint() const {
         SkASSERT(fPaint);
         return *fPaint;
     }
-    
+
     bool next(SkDrawFilter::Type drawType);
-    
+
 private:
     SkLazyPaint     fLazyPaint;
     SkCanvas*       fCanvas;
@@ -423,7 +424,7 @@
 SkCanvas::SkCanvas()
 : fMCStack(sizeof(MCRec), fMCRecStorage, sizeof(fMCRecStorage)) {
     inc_canvas();
-    
+
     this->init(NULL);
 }
 
@@ -469,6 +470,13 @@
 
 ///////////////////////////////////////////////////////////////////////////////
 
+void SkCanvas::flush() {
+    SkDevice* device = this->getDevice();
+    if (device) {
+        device->flush();
+    }
+}
+
 SkISize SkCanvas::getDeviceSize() const {
     SkDevice* d = this->getDevice();
     return d ? SkISize::Make(d->width(), d->height()) : SkISize::Make(0, 0);
@@ -565,7 +573,7 @@
 
 bool SkCanvas::readPixels(const SkIRect& srcRect, SkBitmap* bitmap) {
     SkDevice* device = this->getDevice();
-    
+
     SkIRect bounds;
     bounds.set(0, 0, device->width(), device->height());
     if (!bounds.intersect(srcRect)) {
@@ -591,6 +599,10 @@
     }
 }
 
+SkCanvas* SkCanvas::canvasForDrawIter() {
+    return this;
+}
+
 //////////////////////////////////////////////////////////////////////////////
 
 void SkCanvas::updateDeviceCMCache() {
@@ -695,19 +707,12 @@
     return (flags & SkCanvas::kClipToLayer_SaveFlag) != 0;
 }
 
-int SkCanvas::saveLayer(const SkRect* bounds, const SkPaint* paint,
-                        SaveFlags flags) {
-    // do this before we create the layer. We don't call the public save() since
-    // that would invoke a possibly overridden virtual
-    int count = this->internalSave(flags);
-
-    fDeviceCMDirty = true;
-
+bool SkCanvas::clipRectBounds(const SkRect* bounds, SaveFlags flags,
+                               SkIRect* intersection) {
     SkIRect clipBounds;
     if (!this->getClipDeviceBounds(&clipBounds)) {
-        return count;
+        return false;
     }
-
     SkIRect ir;
     if (NULL != bounds) {
         SkRect r;
@@ -719,16 +724,36 @@
             if (bounds_affects_clip(flags)) {
                 fMCRec->fRasterClip->setEmpty();
             }
-            return count;
+            return false;
         }
     } else {    // no user bounds, so just use the clip
         ir = clipBounds;
     }
 
     fClipStack.clipDevRect(ir, SkRegion::kIntersect_Op);
+
     // early exit if the clip is now empty
     if (bounds_affects_clip(flags) &&
         !fMCRec->fRasterClip->op(ir, SkRegion::kIntersect_Op)) {
+        return false;
+    }
+
+    if (intersection) {
+        *intersection = ir;
+    }
+    return true;
+}
+
+int SkCanvas::saveLayer(const SkRect* bounds, const SkPaint* paint,
+                        SaveFlags flags) {
+    // do this before we create the layer. We don't call the public save() since
+    // that would invoke a possibly overridden virtual
+    int count = this->internalSave(flags);
+
+    fDeviceCMDirty = true;
+
+    SkIRect ir;
+    if (!this->clipRectBounds(bounds, flags, &ir)) {
         return count;
     }
 
@@ -836,7 +861,7 @@
     if (count < 1) {
         count = 1;
     }
-    
+
     int n = this->getSaveCount() - count;
     for (int i = 0; i < n; ++i) {
         this->restore();
@@ -876,7 +901,7 @@
 class DeviceImageFilterProxy : public SkImageFilter::Proxy {
 public:
     DeviceImageFilterProxy(SkDevice* device) : fDevice(device) {}
-    
+
     virtual SkDevice* createDevice(int w, int h) SK_OVERRIDE;
     virtual bool filterImage(SkImageFilter*, const SkBitmap& src,
                              const SkMatrix& ctm,
@@ -1032,8 +1057,8 @@
             return currClip->op(clip, op);
         }
     } else {
-        const SkBitmap& bm = canvas->getDevice()->accessBitmap(false);
-        base.setRect(0, 0, bm.width(), bm.height());
+        const SkDevice* device = canvas->getDevice();
+        base.setRect(0, 0, device->width(), device->height());
 
         if (SkRegion::kReplace_Op == op) {
             return currClip->setPath(devPath, base, doAA);
@@ -1172,28 +1197,15 @@
     return path.isEmpty() || this->quickReject(path.getBounds(), et);
 }
 
-bool SkCanvas::quickRejectY(SkScalar top, SkScalar bottom, EdgeType et) const {
-    /*  current impl ignores edgetype, and relies on
-        getLocalClipBoundsCompareType(), which always returns a value assuming
-        antialiasing (worst case)
-     */
-
-    if (fMCRec->fRasterClip->isEmpty()) {
-        return true;
+static inline int pinIntForScalar(int x) {
+#ifdef SK_SCALAR_IS_FIXED
+    if (x < SK_MinS16) {
+        x = SK_MinS16;
+    } else if (x > SK_MaxS16) {
+        x = SK_MaxS16;
     }
-
-    SkScalarCompareType userT = SkScalarToCompareType(top);
-    SkScalarCompareType userB = SkScalarToCompareType(bottom);
-
-    // check for invalid user Y coordinates (i.e. empty)
-    // reed: why do we need to do this check, since it slows us down?
-    if (userT >= userB) {
-        return true;
-    }
-
-    // check if we are above or below the local clip bounds
-    const SkRectCompareType& clipR = this->getLocalClipBoundsCompareType();
-    return userT >= clipR.fBottom || userB <= clipR.fTop;
+#endif
+    return x;
 }
 
 bool SkCanvas::getClipBounds(SkRect* bounds, EdgeType et) const {
@@ -1215,8 +1227,16 @@
         SkRect r;
         // adjust it outwards if we are antialiasing
         int inset = (kAA_EdgeType == et);
-        r.iset(ibounds.fLeft - inset,  ibounds.fTop - inset,
-               ibounds.fRight + inset, ibounds.fBottom + inset);
+
+        // SkRect::iset() will correctly assert if we pass a value out of range
+        // (when SkScalar==fixed), so we pin to legal values. This does not
+        // really return the correct answer, but it's the best we can do given
+        // that we've promised to return SkRect (even though we support devices
+        // that can be larger than 32K in width or height).
+        r.iset(pinIntForScalar(ibounds.fLeft - inset),
+               pinIntForScalar(ibounds.fTop - inset),
+               pinIntForScalar(ibounds.fRight + inset), 
+               pinIntForScalar(ibounds.fBottom + inset));
         inverse.mapRect(bounds, r);
     }
     return true;
@@ -1282,7 +1302,7 @@
     }
 }
 
-SkDevice* SkCanvas::createCompatibleDevice(SkBitmap::Config config, 
+SkDevice* SkCanvas::createCompatibleDevice(SkBitmap::Config config,
                                            int width, int height,
                                            bool isOpaque) {
     SkDevice* device = this->getDevice();
@@ -1384,18 +1404,18 @@
                           const SkPaint* paint) {
     SkDEBUGCODE(bitmap.validate();)
 
-    SkRect bounds;
-    bounds.set(x, y,
-               x + SkIntToScalar(bitmap.width()),
-               y + SkIntToScalar(bitmap.height()));
-    if (NULL == paint) {
+    if (NULL == paint || paint->canComputeFastBounds()) {
+        SkRect bounds = {
+            x, y,
+            x + SkIntToScalar(bitmap.width()),
+            y + SkIntToScalar(bitmap.height())
+        };
+        if (paint) {
+            (void)paint->computeFastBounds(bounds, &bounds);
+        }
         if (this->quickReject(bounds, paint2EdgeType(paint))) {
             return;
         }
-    } else if (paint->canComputeFastBounds() &&
-               this->quickReject(paint->computeFastBounds(bounds, &bounds),
-                                 paint2EdgeType(paint))) {
-        return;
     }
 
     SkMatrix matrix;
@@ -1409,22 +1429,21 @@
     if (bitmap.width() == 0 || bitmap.height() == 0 || dst.isEmpty()) {
         return;
     }
-    
+
     // do this now, to avoid the cost of calling extract for RLE bitmaps
-    if (NULL == paint) {
-        if (this->quickReject(dst, paint2EdgeType(paint))) {
-            return;
+    if (NULL == paint || paint->canComputeFastBounds()) {
+        SkRect storage;
+        const SkRect* bounds = &dst;
+        if (paint) {
+            bounds = &paint->computeFastBounds(dst, &storage);
         }
-    } else if (paint->canComputeFastBounds()) {
-        SkRect fastBounds;
-        if (this->quickReject(paint->computeFastBounds(dst, &fastBounds),
-                              paint2EdgeType(paint))) {
+        if (this->quickReject(*bounds, paint2EdgeType(paint))) {
             return;
         }
     }
 
     const SkBitmap* bitmapPtr = &bitmap;
-    
+
     SkMatrix matrix;
     SkRect tmpSrc;
     if (src) {
@@ -1444,7 +1463,7 @@
                    SkIntToScalar(bitmap.height()));
     }
     matrix.setRectToRect(tmpSrc, dst, SkMatrix::kFill_ScaleToFit);
-    
+
     // ensure that src is "valid" before we pass it to our internal routines
     // and to SkDevice. i.e. sure it is contained inside the original bitmap.
     SkIRect tmpISrc;
@@ -1486,14 +1505,13 @@
 void SkCanvas::internalDrawBitmapNine(const SkBitmap& bitmap,
                                       const SkIRect& center, const SkRect& dst,
                                       const SkPaint* paint) {
-    if (NULL == paint) {
-        if (this->quickReject(dst, paint2EdgeType(paint))) {
-            return;
+    if (NULL == paint || paint->canComputeFastBounds()) {
+        SkRect storage;
+        const SkRect* bounds = &dst;
+        if (paint) {
+            bounds = &paint->computeFastBounds(dst, &storage);
         }
-    } else if (paint->canComputeFastBounds()) {
-        SkRect fastBounds;
-        if (this->quickReject(paint->computeFastBounds(dst, &fastBounds),
-                              paint2EdgeType(paint))) {
+        if (this->quickReject(*bounds, paint2EdgeType(paint))) {
             return;
         }
     }
@@ -1518,17 +1536,17 @@
         dst.fTop, dst.fTop + SkIntToScalar(c.fTop),
         dst.fBottom - SkIntToScalar(h - c.fBottom), dst.fBottom
     };
-    
+
     if (dstX[1] > dstX[2]) {
         dstX[1] = dstX[0] + (dstX[3] - dstX[0]) * c.fLeft / (w - c.width());
         dstX[2] = dstX[1];
     }
-    
+
     if (dstY[1] > dstY[2]) {
         dstY[1] = dstY[0] + (dstY[3] - dstY[0]) * c.fTop / (h - c.height());
         dstY[2] = dstY[1];
     }
-    
+
     SkIRect s;
     SkRect  d;
     for (int y = 0; y < 3; y++) {
diff --git a/src/core/SkConfig8888.cpp b/src/core/SkConfig8888.cpp
new file mode 100644
index 0000000..10a1b36
--- /dev/null
+++ b/src/core/SkConfig8888.cpp
@@ -0,0 +1,281 @@
+#include "SkConfig8888.h"
+
+namespace {
+
+template <int A_IDX, int R_IDX, int G_IDX, int B_IDX>
+inline uint32_t pack_config8888(uint32_t a, uint32_t r,
+                                uint32_t g, uint32_t b) {
+#ifdef SK_CPU_LENDIAN
+    return (a << (A_IDX * 8)) | (r << (R_IDX * 8)) |
+           (g << (G_IDX * 8)) | (b << (B_IDX * 8));
+#else
+    return (a << ((3-A_IDX) * 8)) | (r << ((3-R_IDX) * 8)) |
+           (g << ((3-G_IDX) * 8)) | (b << ((3-B_IDX) * 8));
+#endif
+}
+
+template <int A_IDX, int R_IDX, int G_IDX, int B_IDX>
+inline void unpack_config8888(uint32_t color,
+                              uint32_t* a, uint32_t* r,
+                              uint32_t* g, uint32_t* b) {
+#ifdef SK_CPU_LENDIAN
+    *a = (color >> (A_IDX * 8)) & 0xff;
+    *r = (color >> (R_IDX * 8)) & 0xff;
+    *g = (color >> (G_IDX * 8)) & 0xff;
+    *b = (color >> (B_IDX * 8)) & 0xff;
+#else
+    *a = (color >> ((3 - A_IDX) * 8)) & 0xff;
+    *r = (color >> ((3 - R_IDX) * 8)) & 0xff;
+    *g = (color >> ((3 - G_IDX) * 8)) & 0xff;
+    *b = (color >> ((3 - B_IDX) * 8)) & 0xff;
+#endif
+}
+
+#ifdef SK_CPU_LENDIAN
+    static const int SK_NATIVE_A_IDX = SK_A32_SHIFT / 8;
+    static const int SK_NATIVE_R_IDX = SK_R32_SHIFT / 8;
+    static const int SK_NATIVE_G_IDX = SK_G32_SHIFT / 8;
+    static const int SK_NATIVE_B_IDX = SK_B32_SHIFT / 8;
+#else
+    static const int SK_NATIVE_A_IDX = 3 - (SK_A32_SHIFT / 8);
+    static const int SK_NATIVE_R_IDX = 3 - (SK_R32_SHIFT / 8);
+    static const int SK_NATIVE_G_IDX = 3 - (SK_G32_SHIFT / 8);
+    static const int SK_NATIVE_B_IDX = 3 - (SK_B32_SHIFT / 8);
+#endif
+
+/**
+ * convert_pixel<OUT_CFG, IN_CFG> converts a pixel value from one Config8888 to
+ * another. It is implemented by first expanding OUT_CFG to r, g, b, a indices
+ * and an is_premul bool as params to another template function. Then IN_CFG is
+ * expanded via another function call.
+ */
+
+template <bool OUT_PM, int OUT_A_IDX, int OUT_R_IDX, int OUT_G_IDX, int OUT_B_IDX,
+          bool IN_PM,  int IN_A_IDX,  int IN_R_IDX,  int IN_G_IDX,  int IN_B_IDX>
+inline uint32_t convert_pixel(uint32_t pixel) {
+    uint32_t a, r, g, b;
+    unpack_config8888<IN_A_IDX, IN_R_IDX, IN_G_IDX, IN_B_IDX>(pixel, &a, &r, &g, &b);
+    if (IN_PM && !OUT_PM) {
+        // We're doing the explicit divide to match WebKit layout
+        // test expectations. We can modify and rebaseline if
+        // it can be shown that there is a more performant way to
+        // unpremul.
+        if (a) {
+            r = r * 0xff / a;
+            g = g * 0xff / a;
+            b = b * 0xff / a;
+        } else {
+            return 0;
+        }
+    } else if (!IN_PM && OUT_PM) {
+        // This matches WebKit's conversion which we are replacing.
+        // We can consider alternative rounding rules for performance.
+        r = SkMulDiv255Ceiling(r, a);
+        g = SkMulDiv255Ceiling(g, a);
+        b = SkMulDiv255Ceiling(b, a);
+    }
+    return pack_config8888<OUT_A_IDX, OUT_R_IDX, OUT_G_IDX, OUT_B_IDX>(a, r, g, b);
+}
+
+template <bool OUT_PM, int OUT_A_IDX, int OUT_R_IDX, int OUT_G_IDX, int OUT_B_IDX, SkCanvas::Config8888 IN_CFG>
+inline uint32_t convert_pixel(uint32_t pixel) {
+    switch(IN_CFG) {
+        case SkCanvas::kNative_Premul_Config8888:
+            return convert_pixel<OUT_PM, OUT_A_IDX,       OUT_R_IDX,       OUT_G_IDX,       OUT_B_IDX,
+                                 true,  SK_NATIVE_A_IDX,  SK_NATIVE_R_IDX, SK_NATIVE_G_IDX, SK_NATIVE_B_IDX>(pixel);
+            break;
+        case SkCanvas::kNative_Unpremul_Config8888:
+            return convert_pixel<OUT_PM, OUT_A_IDX,       OUT_R_IDX,       OUT_G_IDX,       OUT_B_IDX,
+                                 false,  SK_NATIVE_A_IDX, SK_NATIVE_R_IDX, SK_NATIVE_G_IDX, SK_NATIVE_B_IDX>(pixel);
+            break;
+        case SkCanvas::kBGRA_Premul_Config8888:
+            return convert_pixel<OUT_PM, OUT_A_IDX, OUT_R_IDX, OUT_G_IDX, OUT_B_IDX,
+                                 true,  3,         2,         1,         0>(pixel);
+            break;
+        case SkCanvas::kBGRA_Unpremul_Config8888:
+            return convert_pixel<OUT_PM, OUT_A_IDX, OUT_R_IDX, OUT_G_IDX, OUT_B_IDX,
+                                 false,  3,         2,         1,         0>(pixel);
+            break;
+        case SkCanvas::kRGBA_Premul_Config8888:
+            return convert_pixel<OUT_PM, OUT_A_IDX, OUT_R_IDX, OUT_G_IDX, OUT_B_IDX,
+                                 true,  3,         0,         1,         2>(pixel);
+            break;
+        case SkCanvas::kRGBA_Unpremul_Config8888:
+            return convert_pixel<OUT_PM, OUT_A_IDX, OUT_R_IDX, OUT_G_IDX, OUT_B_IDX,
+                                 false,  3,         0,         1,         2>(pixel);
+            break;
+        default:
+            SkDEBUGFAIL("Unexpected config8888");
+            return 0;
+            break;
+    }
+}
+
+template <SkCanvas::Config8888 OUT_CFG, SkCanvas::Config8888 IN_CFG>
+inline uint32_t convert_pixel(uint32_t pixel) {
+    switch(OUT_CFG) {
+        case SkCanvas::kNative_Premul_Config8888:
+            return convert_pixel<true,  SK_NATIVE_A_IDX,  SK_NATIVE_R_IDX, SK_NATIVE_G_IDX, SK_NATIVE_B_IDX, IN_CFG>(pixel);
+            break;
+        case SkCanvas::kNative_Unpremul_Config8888:
+            return convert_pixel<false,  SK_NATIVE_A_IDX,  SK_NATIVE_R_IDX, SK_NATIVE_G_IDX, SK_NATIVE_B_IDX, IN_CFG>(pixel);
+            break;
+        case SkCanvas::kBGRA_Premul_Config8888:
+            return convert_pixel<true, 3, 2, 1, 0, IN_CFG>(pixel);
+            break;
+        case SkCanvas::kBGRA_Unpremul_Config8888:
+            return convert_pixel<false, 3, 2, 1, 0, IN_CFG>(pixel);
+            break;
+        case SkCanvas::kRGBA_Premul_Config8888:
+            return convert_pixel<true, 3, 0, 1, 2, IN_CFG>(pixel);
+            break;
+        case SkCanvas::kRGBA_Unpremul_Config8888:
+            return convert_pixel<false, 3, 0, 1, 2, IN_CFG>(pixel);
+            break;
+        default:
+            SkDEBUGFAIL("Unexpected config8888");
+            return 0;
+            break;
+    }
+}
+
+/**
+ * SkConvertConfig8888Pixels has 6 * 6 possible combinations of src and dst
+ * configs. Each is implemented as an instantiation of a templated function.
+ * Two levels of switch statements are used to select the correct instantiation,
+ * one for the src config and one for the dst config.
+ */
+
+template <SkCanvas::Config8888 DST_CFG, SkCanvas::Config8888 SRC_CFG>
+inline void convert_config8888(uint32_t* dstPixels,
+                               size_t dstRowBytes,
+                               const uint32_t* srcPixels,
+                               size_t srcRowBytes,
+                               int width,
+                               int height) {
+    intptr_t dstPix = reinterpret_cast<intptr_t>(dstPixels);
+    intptr_t srcPix = reinterpret_cast<intptr_t>(srcPixels);
+
+    for (int y = 0; y < height; ++y) {
+        srcPixels = reinterpret_cast<const uint32_t*>(srcPix);
+        dstPixels = reinterpret_cast<uint32_t*>(dstPix);
+        for (int x = 0; x < width; ++x) {
+            dstPixels[x] = convert_pixel<DST_CFG, SRC_CFG>(srcPixels[x]);
+        }
+        dstPix += dstRowBytes;
+        srcPix += srcRowBytes;
+    }
+}
+
+template <SkCanvas::Config8888 SRC_CFG>
+inline void convert_config8888(uint32_t* dstPixels,
+                               size_t dstRowBytes,
+                               SkCanvas::Config8888 dstConfig,
+                               const uint32_t* srcPixels,
+                               size_t srcRowBytes,
+                               int width,
+                               int height) {
+    switch(dstConfig) {
+        case SkCanvas::kNative_Premul_Config8888:
+            convert_config8888<SkCanvas::kNative_Premul_Config8888, SRC_CFG>(dstPixels, dstRowBytes, srcPixels, srcRowBytes, width, height);
+            break;
+        case SkCanvas::kNative_Unpremul_Config8888:
+            convert_config8888<SkCanvas::kNative_Unpremul_Config8888, SRC_CFG>(dstPixels, dstRowBytes, srcPixels, srcRowBytes, width, height);
+            break;
+        case SkCanvas::kBGRA_Premul_Config8888:
+            convert_config8888<SkCanvas::kBGRA_Premul_Config8888, SRC_CFG>(dstPixels, dstRowBytes, srcPixels, srcRowBytes, width, height);
+            break;
+        case SkCanvas::kBGRA_Unpremul_Config8888:
+            convert_config8888<SkCanvas::kBGRA_Unpremul_Config8888, SRC_CFG>(dstPixels, dstRowBytes, srcPixels, srcRowBytes, width, height);
+            break;
+        case SkCanvas::kRGBA_Premul_Config8888:
+            convert_config8888<SkCanvas::kRGBA_Premul_Config8888, SRC_CFG>(dstPixels, dstRowBytes, srcPixels, srcRowBytes, width, height);
+            break;
+        case SkCanvas::kRGBA_Unpremul_Config8888:
+            convert_config8888<SkCanvas::kRGBA_Unpremul_Config8888, SRC_CFG>(dstPixels, dstRowBytes, srcPixels, srcRowBytes, width, height);
+            break;
+        default:
+            SkDEBUGFAIL("Unexpected config8888");
+            break;
+    }
+}
+
+}
+
+void SkConvertConfig8888Pixels(uint32_t* dstPixels,
+                               size_t dstRowBytes,
+                               SkCanvas::Config8888 dstConfig,
+                               const uint32_t* srcPixels,
+                               size_t srcRowBytes,
+                               SkCanvas::Config8888 srcConfig,
+                               int width,
+                               int height) {
+    if (srcConfig == dstConfig) {
+        if (srcPixels == dstPixels) {
+            return;
+        }
+        if (dstRowBytes == srcRowBytes &&
+            4U * width == srcRowBytes) {
+            memcpy(dstPixels, srcPixels, srcRowBytes * height);
+            return;
+        } else {
+            intptr_t srcPix = reinterpret_cast<intptr_t>(srcPixels);
+            intptr_t dstPix = reinterpret_cast<intptr_t>(dstPixels);
+            for (int y = 0; y < height; ++y) {
+                srcPixels = reinterpret_cast<const uint32_t*>(srcPix);
+                dstPixels = reinterpret_cast<uint32_t*>(dstPix);
+                memcpy(dstPixels, srcPixels, 4 * width);
+                srcPix += srcRowBytes;
+                dstPix += dstRowBytes;
+            }
+            return;
+        }
+    }
+    switch(srcConfig) {
+        case SkCanvas::kNative_Premul_Config8888:
+            convert_config8888<SkCanvas::kNative_Premul_Config8888>(dstPixels, dstRowBytes, dstConfig, srcPixels, srcRowBytes, width, height);
+            break;
+        case SkCanvas::kNative_Unpremul_Config8888:
+            convert_config8888<SkCanvas::kNative_Unpremul_Config8888>(dstPixels, dstRowBytes, dstConfig, srcPixels, srcRowBytes, width, height);
+            break;
+        case SkCanvas::kBGRA_Premul_Config8888:
+            convert_config8888<SkCanvas::kBGRA_Premul_Config8888>(dstPixels, dstRowBytes, dstConfig, srcPixels, srcRowBytes, width, height);
+            break;
+        case SkCanvas::kBGRA_Unpremul_Config8888:
+            convert_config8888<SkCanvas::kBGRA_Unpremul_Config8888>(dstPixels, dstRowBytes, dstConfig, srcPixels, srcRowBytes, width, height);
+            break;
+        case SkCanvas::kRGBA_Premul_Config8888:
+            convert_config8888<SkCanvas::kRGBA_Premul_Config8888>(dstPixels, dstRowBytes, dstConfig, srcPixels, srcRowBytes, width, height);
+            break;
+        case SkCanvas::kRGBA_Unpremul_Config8888:
+            convert_config8888<SkCanvas::kRGBA_Unpremul_Config8888>(dstPixels, dstRowBytes, dstConfig, srcPixels, srcRowBytes, width, height);
+            break;
+        default:
+            SkDEBUGFAIL("Unexpected config8888");
+            break;
+    }
+}
+
+uint32_t SkPackConfig8888(SkCanvas::Config8888 config,
+                          uint32_t a,
+                          uint32_t r,
+                          uint32_t g,
+                          uint32_t b) {
+    switch (config) {
+        case SkCanvas::kNative_Premul_Config8888:
+        case SkCanvas::kNative_Unpremul_Config8888:
+            return pack_config8888<SK_NATIVE_A_IDX,
+                                   SK_NATIVE_R_IDX,
+                                   SK_NATIVE_G_IDX,
+                                   SK_NATIVE_B_IDX>(a, r, g, b);
+        case SkCanvas::kBGRA_Premul_Config8888:
+        case SkCanvas::kBGRA_Unpremul_Config8888:
+            return pack_config8888<3, 2, 1, 0>(a, r, g, b);
+        case SkCanvas::kRGBA_Premul_Config8888:
+        case SkCanvas::kRGBA_Unpremul_Config8888:
+            return pack_config8888<3, 0, 1, 2>(a, r, g, b);
+        default:
+            SkDEBUGFAIL("Unexpected config8888");
+            return 0;
+    }
+}
diff --git a/src/core/SkConfig8888.h b/src/core/SkConfig8888.h
index fe2f2cc..a891370 100644
--- a/src/core/SkConfig8888.h
+++ b/src/core/SkConfig8888.h
@@ -10,6 +10,27 @@
 #include "SkCanvas.h"
 #include "SkColorPriv.h"
 
+/**
+ * Converts pixels from one Config8888 to another Config8888
+ */
+void SkConvertConfig8888Pixels(uint32_t* dstPixels,
+                               size_t dstRowBytes,
+                               SkCanvas::Config8888 dstConfig,
+                               const uint32_t* srcPixels,
+                               size_t srcRowBytes,
+                               SkCanvas::Config8888 srcConfig,
+                               int width,
+                               int height);
+
+/**
+ * Packs a, r, g, b values into the byte order specified by config.
+ */
+uint32_t SkPackConfig8888(SkCanvas::Config8888 config,
+                          uint32_t a,
+                          uint32_t r,
+                          uint32_t g,
+                          uint32_t b);
+
 namespace {
 
 /**
@@ -22,14 +43,6 @@
                                      const SkBitmap& srcBmp);
 
 /**
- * Copies all pixels in a bitmap to a dst ptr with row bytes. The src bitmap
- * is assumed to have pixels and be kARGB_8888_Config. No conversion is applied
- */
-inline void SkCopyARGB8888BitmapTo(uint32_t* dstPixels,
-                                   size_t dstRowBytes,
-                                   const SkBitmap& srcBmp);
-
-/**
   Copies over all pixels in a bitmap from a src ptr with a given rowBytes and
   Config8888. The bitmap must have pixels and be kARGB_8888_Config.
  */
@@ -45,243 +58,32 @@
 
 namespace {
 
-template <int A_IDX, int R_IDX, int G_IDX, int B_IDX>
-inline uint32_t pack_config8888(uint32_t a, uint32_t r,
-                                uint32_t g, uint32_t b) {
-#ifdef SK_CPU_LENDIAN
-    return (a << (A_IDX * 8)) | (r << (R_IDX * 8)) |
-           (g << (G_IDX * 8)) | (b << (B_IDX * 8));
-#else
-    return (a << ((3-A_IDX) * 8)) | (r << ((3-R_IDX) * 8)) |
-           (g << ((3-G_IDX) * 8)) | (b << ((3-B_IDX) * 8));
-#endif
-}
-
-template <int A_IDX, int R_IDX, int G_IDX, int B_IDX>
-inline void unpack_config8888(uint32_t color,
-                              uint32_t* a, uint32_t* r,
-                              uint32_t* g, uint32_t* b) {
-#ifdef SK_CPU_LENDIAN
-    *a = (color >> (A_IDX * 8)) & 0xff;
-    *r = (color >> (R_IDX * 8)) & 0xff;
-    *g = (color >> (G_IDX * 8)) & 0xff;
-    *b = (color >> (B_IDX * 8)) & 0xff;
-#else
-    *a = (color >> ((3 - A_IDX) * 8)) & 0xff;
-    *r = (color >> ((3 - R_IDX) * 8)) & 0xff;
-    *g = (color >> ((3 - G_IDX) * 8)) & 0xff;
-    *b = (color >> ((3 - B_IDX) * 8)) & 0xff;
-#endif
-}
-
-template <bool UNPM, int A_IDX, int R_IDX, int G_IDX, int B_IDX>
-inline void bitmap_copy_to_config8888(uint32_t* dstPixels,
-                                      size_t dstRowBytes,
-                                      const SkBitmap& srcBmp) {
+inline void SkCopyBitmapToConfig8888(uint32_t* dstPixels,
+                                     size_t dstRowBytes,
+                                     SkCanvas::Config8888 dstConfig8888,
+                                     const SkBitmap& srcBmp) {
     SkASSERT(SkBitmap::kARGB_8888_Config == srcBmp.config());
     SkAutoLockPixels alp(srcBmp);
     int w = srcBmp.width();
     int h = srcBmp.height();
     size_t srcRowBytes = srcBmp.rowBytes();
+    const uint32_t* srcPixels = reinterpret_cast<uint32_t*>(srcBmp.getPixels());
 
-    intptr_t src = reinterpret_cast<intptr_t>(srcBmp.getPixels());
-    intptr_t dst = reinterpret_cast<intptr_t>(dstPixels);
-
-    for (int y = 0; y < h; ++y) {
-        const SkPMColor* srcRow = reinterpret_cast<SkPMColor*>(src);
-        uint32_t* dstRow  = reinterpret_cast<uint32_t*>(dst);
-        for (int x = 0; x < w; ++x) {
-            SkPMColor pmcolor = srcRow[x];
-            if (UNPM) {
-                U8CPU a, r, g, b;
-                a = SkGetPackedA32(pmcolor);
-                if (a) {
-                    // We're doing the explicit divide to match WebKit layout
-                    // test expectations. We can modify and rebaseline if there
-                    // it can be shown that there is a more performant way to
-                    // unpremul.
-                    r = SkGetPackedR32(pmcolor) * 0xff / a;
-                    g = SkGetPackedG32(pmcolor) * 0xff / a;
-                    b = SkGetPackedB32(pmcolor) * 0xff / a;
-                    dstRow[x] = pack_config8888<A_IDX, R_IDX,
-                                                G_IDX, B_IDX>(a, r, g, b);
-                } else {
-                    dstRow[x] = 0;
-                }
-            } else {
-                dstRow[x] = pack_config8888<A_IDX, R_IDX,
-                                            G_IDX, B_IDX>(
-                                                   SkGetPackedA32(pmcolor),
-                                                   SkGetPackedR32(pmcolor),
-                                                   SkGetPackedG32(pmcolor),
-                                                   SkGetPackedB32(pmcolor));
-            }
-        }
-        dst += dstRowBytes;
-        src += srcRowBytes;
-    }
-}
-
-template <bool PM, int A_IDX, int R_IDX, int G_IDX, int B_IDX>
-inline void config8888_copy_to_bitmap(const SkBitmap& dstBmp,
-                                      const uint32_t* srcPixels,
-                                      size_t srcRowBytes) {
-    SkASSERT(SkBitmap::kARGB_8888_Config == dstBmp.config());
-    SkAutoLockPixels alp(dstBmp);
-    int w = dstBmp.width();
-    int h = dstBmp.height();
-    size_t dstRowBytes = dstBmp.rowBytes();
-
-    intptr_t src = reinterpret_cast<intptr_t>(srcPixels);
-    intptr_t dst = reinterpret_cast<intptr_t>(dstBmp.getPixels());
-
-    for (int y = 0; y < h; ++y) {
-        const uint32_t* srcRow  = reinterpret_cast<uint32_t*>(src);
-        SkPMColor* dstRow = reinterpret_cast<SkPMColor*>(dst);
-        for (int x = 0; x < w; ++x) {
-            uint32_t c8888 = srcRow[x];
-            uint32_t a, r, g, b;
-            unpack_config8888<A_IDX, R_IDX, G_IDX, B_IDX>(c8888, &a, &r,
-                                                                 &g, &b);
-            if (PM) {
-                // This matches WebKit's conversion which we are replacing.
-                // We can consider alternative rounding rules for performance.
-                r = SkMulDiv255Ceiling(r, a);
-                g = SkMulDiv255Ceiling(g, a);
-                b = SkMulDiv255Ceiling(b, a);
-            }
-            // NoCheck: https://bugs.webkit.org/show_bug.cgi?id=74025
-            dstRow[x] = SkPackARGB32NoCheck(a, r, g, b);
-        }
-        src += srcRowBytes;
-        dst += dstRowBytes;
-    }
-}
-
-#ifdef SK_CPU_LENDIAN
-    static const int SK_NATIVE_A_IDX = SK_A32_SHIFT / 8;
-    static const int SK_NATIVE_R_IDX = SK_R32_SHIFT / 8;
-    static const int SK_NATIVE_G_IDX = SK_G32_SHIFT / 8;
-    static const int SK_NATIVE_B_IDX = SK_B32_SHIFT / 8;
-#else
-    static const int SK_NATIVE_A_IDX = 3 - (SK_A32_SHIFT / 8);
-    static const int SK_NATIVE_R_IDX = 3 - (SK_R32_SHIFT / 8);
-    static const int SK_NATIVE_G_IDX = 3 - (SK_G32_SHIFT / 8);
-    static const int SK_NATIVE_B_IDX = 3 - (SK_B32_SHIFT / 8);
-#endif
-
-inline void SkCopyBitmapToConfig8888(uint32_t* dstPixels,
-                                     size_t dstRowBytes,
-                                     SkCanvas::Config8888 dstConfig8888,
-                                     const SkBitmap& srcBmp) {
-    switch (dstConfig8888) {
-        case SkCanvas::kNative_Premul_Config8888:
-            bitmap_copy_to_config8888<false,
-                                      SK_NATIVE_A_IDX, SK_NATIVE_R_IDX,
-                                      SK_NATIVE_G_IDX, SK_NATIVE_B_IDX>(
-                                            dstPixels,
-                                            dstRowBytes,
-                                            srcBmp);
-            break;
-        case SkCanvas::kNative_Unpremul_Config8888:
-            bitmap_copy_to_config8888<true,
-                                      SK_NATIVE_A_IDX, SK_NATIVE_R_IDX,
-                                      SK_NATIVE_G_IDX, SK_NATIVE_B_IDX>(
-                                            dstPixels,
-                                            dstRowBytes,
-                                            srcBmp);
-            break;
-        case SkCanvas::kBGRA_Premul_Config8888:
-            bitmap_copy_to_config8888<false, 3, 2, 1, 0> (
-                                    dstPixels, dstRowBytes, srcBmp);
-            break;
-        case SkCanvas::kBGRA_Unpremul_Config8888:
-            bitmap_copy_to_config8888<true, 3, 2, 1, 0> (
-                                    dstPixels, dstRowBytes, srcBmp);
-            break;
-        case SkCanvas::kRGBA_Premul_Config8888:
-            bitmap_copy_to_config8888<false, 3, 0, 1, 2> (
-                                    dstPixels, dstRowBytes, srcBmp);
-            break;
-        case SkCanvas::kRGBA_Unpremul_Config8888:
-            bitmap_copy_to_config8888<true, 3, 0, 1, 2> (
-                                    dstPixels, dstRowBytes, srcBmp);
-            break;
-        default:
-            SkDEBUGFAIL("unexpected Config8888");
-            break;
-    }
+    SkConvertConfig8888Pixels(dstPixels, dstRowBytes, dstConfig8888, srcPixels, srcRowBytes, SkCanvas::kNative_Premul_Config8888, w, h);
 }
 
 inline void SkCopyConfig8888ToBitmap(const SkBitmap& dstBmp,
                                      const uint32_t* srcPixels,
                                      size_t srcRowBytes,
                                      SkCanvas::Config8888 srcConfig8888) {
-    switch (srcConfig8888) {
-        case SkCanvas::kNative_Premul_Config8888:
-            config8888_copy_to_bitmap<false,
-                                      SK_NATIVE_A_IDX, SK_NATIVE_R_IDX,
-                                      SK_NATIVE_G_IDX, SK_NATIVE_B_IDX>(
-                                            dstBmp,
-                                            srcPixels,
-                                            srcRowBytes);
-            break;
-        case SkCanvas::kNative_Unpremul_Config8888:
-            config8888_copy_to_bitmap<true,
-                                      SK_NATIVE_A_IDX, SK_NATIVE_R_IDX,
-                                      SK_NATIVE_G_IDX, SK_NATIVE_B_IDX>(
-                                            dstBmp,
-                                            srcPixels,
-                                            srcRowBytes);
-            break;
-        case SkCanvas::kBGRA_Premul_Config8888:
-            config8888_copy_to_bitmap<false, 3, 2, 1, 0> (
-                                    dstBmp, srcPixels, srcRowBytes);
-            break;
-        case SkCanvas::kBGRA_Unpremul_Config8888:
-            config8888_copy_to_bitmap<true, 3, 2, 1, 0> (
-                                    dstBmp, srcPixels, srcRowBytes);
-            break;
-        case SkCanvas::kRGBA_Premul_Config8888:
-            config8888_copy_to_bitmap<false, 3, 0, 1, 2> (
-                                    dstBmp, srcPixels, srcRowBytes);
-            break;
-        case SkCanvas::kRGBA_Unpremul_Config8888:
-            config8888_copy_to_bitmap<true, 3, 0, 1, 2> (
-                                    dstBmp, srcPixels, srcRowBytes);
-            break;
-        default:
-            SkDEBUGFAIL("unexpected Config8888");
-            break;
-    }
-}
+    SkASSERT(SkBitmap::kARGB_8888_Config == dstBmp.config());
+    SkAutoLockPixels alp(dstBmp);
+    int w = dstBmp.width();
+    int h = dstBmp.height();
+    size_t dstRowBytes = dstBmp.rowBytes();
+    uint32_t* dstPixels = reinterpret_cast<uint32_t*>(dstBmp.getPixels());
 
-inline void SkCopyARGB8888BitmapTo(uint32_t* dstPixels,
-                                   size_t dstRowBytes,
-                                   const SkBitmap& srcBmp) {
-    SkASSERT(SkBitmap::kARGB_8888_Config == srcBmp.config());
-
-    SkAutoLockPixels alp(srcBmp);
-
-    int w = srcBmp.width();
-    int h = srcBmp.height();
-    size_t srcRowBytes = srcBmp.rowBytes();
-
-    size_t tightRowBytes = w * 4;
-
-    char* src = reinterpret_cast<char*>(srcBmp.getPixels());
-    char* dst = reinterpret_cast<char*>(dstPixels);
-
-    if (tightRowBytes == srcRowBytes &&
-        tightRowBytes == dstRowBytes) {
-        memcpy(dst, src, tightRowBytes * h);
-    } else {
-        for (int y = 0; y < h; ++y) {
-            memcpy(dst, src, tightRowBytes);
-            dst += dstRowBytes;
-            src += srcRowBytes;
-        }
-    }
+    SkConvertConfig8888Pixels(dstPixels, dstRowBytes, SkCanvas::kNative_Premul_Config8888, srcPixels, srcRowBytes, srcConfig8888, w, h);
 }
 
 }
diff --git a/src/core/SkDevice.cpp b/src/core/SkDevice.cpp
index aaafb14..f1da2ef 100644
--- a/src/core/SkDevice.cpp
+++ b/src/core/SkDevice.cpp
@@ -34,7 +34,7 @@
     delete fMetaData;
 }
 
-SkDevice* SkDevice::createCompatibleDevice(SkBitmap::Config config, 
+SkDevice* SkDevice::createCompatibleDevice(SkBitmap::Config config,
                                            int width, int height,
                                            bool isOpaque) {
     return this->onCreateCompatibleDevice(config, width, height,
@@ -48,8 +48,8 @@
                                           isOpaque, kSaveLayer_Usage);
 }
 
-SkDevice* SkDevice::onCreateCompatibleDevice(SkBitmap::Config config, 
-                                             int width, int height, 
+SkDevice* SkDevice::onCreateCompatibleDevice(SkBitmap::Config config,
+                                             int width, int height,
                                              bool isOpaque,
                                              Usage usage) {
     return SkNEW_ARGS(SkDevice,(config, width, height, isOpaque));
@@ -77,11 +77,11 @@
 }
 
 const SkBitmap& SkDevice::accessBitmap(bool changePixels) {
-    this->onAccessBitmap(&fBitmap);
+    const SkBitmap& bitmap = this->onAccessBitmap(&fBitmap);
     if (changePixels) {
-        fBitmap.notifyPixelsChanged();
+        bitmap.notifyPixelsChanged();
     }
-    return fBitmap;
+    return bitmap;
 }
 
 void SkDevice::getGlobalBounds(SkIRect* bounds) const {
@@ -95,7 +95,7 @@
     fBitmap.eraseColor(color);
 }
 
-void SkDevice::onAccessBitmap(SkBitmap* bitmap) {}
+const SkBitmap& SkDevice::onAccessBitmap(SkBitmap* bitmap) {return *bitmap;}
 
 void SkDevice::setMatrixClip(const SkMatrix& matrix, const SkRegion& region,
                              const SkClipStack& clipStack) {
@@ -204,19 +204,11 @@
     }
     if (SkBitmap::kARGB_8888_Config != subset.config()) {
         // It'd be preferable to do this directly to bitmap.
-        subset.copyTo(&subset, SkBitmap::kARGB_8888_Config); 
+        subset.copyTo(&subset, SkBitmap::kARGB_8888_Config);
     }
     SkAutoLockPixels alp(bitmap);
     uint32_t* bmpPixels = reinterpret_cast<uint32_t*>(bitmap.getPixels());
-    if ((SkCanvas::kNative_Premul_Config8888 == config8888 ||
-         kPMColorAlias == config8888)) {
-        SkCopyARGB8888BitmapTo(bmpPixels, bitmap.rowBytes(), subset);
-    } else {
-        SkCopyBitmapToConfig8888(bmpPixels,
-                                 bitmap.rowBytes(),
-                                 config8888,
-                                 subset);
-    }
+    SkCopyBitmapToConfig8888(bmpPixels, bitmap.rowBytes(), config8888, subset);
     return true;
 }
 
diff --git a/src/core/SkDeviceProfile.cpp b/src/core/SkDeviceProfile.cpp
new file mode 100644
index 0000000..2c2cb88
--- /dev/null
+++ b/src/core/SkDeviceProfile.cpp
@@ -0,0 +1,72 @@
+
+
+#include "SkDeviceProfile.h"
+
+#define DEFAULT_GAMMAEXP        2.2
+#define DEFAULT_CONTRASTSCALE   0.5
+#define DEFAULT_LCDCONFIG       SkDeviceProfile::kNone_LCDConfig
+#define DEFAULT_FONTHINTLEVEL   SkDeviceProfile::kSlight_FontHintLevel
+
+static float pin(float value, float min, float max) {
+    // Clamp 'value' into [min, max]. The lower bound is tested first, and a
+    // value that fails both comparisons is returned unchanged.
+    if (value < min) {
+        return min;
+    }
+    return (value > max) ? max : value;
+}
+
+SkDeviceProfile::SkDeviceProfile(float gammaExp, float contrast,
+                                 LCDConfig config, FontHintLevel level) {
+    fGammaExponent = pin(gammaExp, 0, 10);  // stored value is clamped to [0, 10]
+    fContrastScale = pin(contrast, 0, 1);   // stored value is clamped to [0, 1]
+    fLCDConfig = config;                    // stored as given, no validation
+    fFontHintLevel = level;                 // stored as given, no validation
+}
+
+void SkDeviceProfile::generateTableForLuminanceByte(U8CPU lumByte,
+                                                    uint8_t table[256]) const {
+}  // NOTE(review): empty body, 'table' is never written -- placeholder?
+
+///////////////////////////////////////////////////////////////////////////////
+
+SkDeviceProfile* SkDeviceProfile::Create(float gammaExp,
+                                         float contrast,
+                                         LCDConfig config,
+                                         FontHintLevel level) {
+    // Forwards all four arguments unchanged to the constructor via SkNEW_ARGS.
+    return SkNEW_ARGS(SkDeviceProfile, (gammaExp, contrast, config, level));
+
+static SkMutex gMutex;
+static SkDeviceProfile* gDefaultProfile;
+static SkDeviceProfile* gGlobalProfile;
+
+SkDeviceProfile* SkDeviceProfile::GetDefault() {
+    // Builds the shared default profile on first use, while holding gMutex.
+    // No ref() is taken here on behalf of the caller.
+    SkAutoMutexAcquire amc(gMutex);
+    if (gDefaultProfile) {
+        return gDefaultProfile;
+    }
+    gDefaultProfile = Create(DEFAULT_GAMMAEXP, DEFAULT_CONTRASTSCALE,
+                             DEFAULT_LCDCONFIG, DEFAULT_FONTHINTLEVEL);
+    return gDefaultProfile;
+}
+
+SkDeviceProfile* SkDeviceProfile::RefGlobal() {
+    SkDeviceProfile* dflt = GetDefault();  // locks gMutex itself: call it first
+    SkAutoMutexAcquire amc(gMutex);
+    if (NULL == gGlobalProfile) {
+        SkRefCnt_SafeAssign(gGlobalProfile, dflt);  // global holds its own ref
+    }
+    gGlobalProfile->ref();  // this ref is the one handed to the caller
+    return gGlobalProfile;
+}
+    
+void SkDeviceProfile::SetGlobal(SkDeviceProfile* profile) {
+    SkAutoMutexAcquire amc(gMutex);
+    // Replace the global profile under the lock using the ref-counting macro.
+    SkRefCnt_SafeAssign(gGlobalProfile, profile);
+}
+
+
diff --git a/src/core/SkDraw.cpp b/src/core/SkDraw.cpp
index c6fd406..23a6d59 100644
--- a/src/core/SkDraw.cpp
+++ b/src/core/SkDraw.cpp
@@ -13,6 +13,7 @@
 #include "SkCanvas.h"
 #include "SkColorPriv.h"
 #include "SkDevice.h"
+#include "SkFixed.h"
 #include "SkMaskFilter.h"
 #include "SkPaint.h"
 #include "SkPathEffect.h"
@@ -859,14 +860,14 @@
 }
 
 bool SkDrawTreatAsHairline(const SkPaint& paint, const SkMatrix& matrix,
-                           SkAlpha* newAlpha) {
-    SkASSERT(newAlpha);
+                           SkScalar* coverage) {
+    SkASSERT(coverage);
     if (SkPaint::kStroke_Style != paint.getStyle()) {
         return false;
     }
     SkScalar strokeWidth = paint.getStrokeWidth();
     if (0 == strokeWidth) {
-        *newAlpha = paint.getAlpha();
+        *coverage = SK_Scalar1;
         return true;
     }
 
@@ -876,9 +877,6 @@
     if (!paint.isAntiAlias()) {
         return false;
     }
-    if (!xfermodeSupportsCoverageAsAlpha(paint.getXfermode())) {
-        return false;
-    }
     if (matrix.hasPerspective()) {
         return false;
     }
@@ -890,16 +888,7 @@
     SkScalar len0 = fast_len(dst[0]);
     SkScalar len1 = fast_len(dst[1]);
     if (len0 <= SK_Scalar1 && len1 <= SK_Scalar1) {
-        SkScalar modulate = SkScalarAve(len0, len1);
-#if 0
-        *newAlpha = SkToU8(SkScalarRoundToInt(modulate * paint.getAlpha()));
-#else
-        // this is the old technique, which we preserve for now so we don't
-        // change previous results (testing)
-        // the new way seems fine, its just (a tiny bit) different
-        int scale = (int)SkScalarMul(modulate, 256);
-        *newAlpha = paint.getAlpha() * scale >> 8;
-#endif
+        *coverage = SkScalarAve(len0, len1);
         return true;
     }
     return false;
@@ -946,12 +935,29 @@
     SkTLazy<SkPaint> lazyPaint;
 
     {
-        SkAlpha newAlpha;
-        if (SkDrawTreatAsHairline(origPaint, *matrix, &newAlpha)) {
-            lazyPaint.set(origPaint);
-            lazyPaint.get()->setAlpha(newAlpha);
-            lazyPaint.get()->setStrokeWidth(0);
-            paint = lazyPaint.get();
+        SkScalar coverage;
+        if (SkDrawTreatAsHairline(origPaint, *matrix, &coverage)) {
+            if (SK_Scalar1 == coverage) {
+                lazyPaint.set(origPaint);
+                lazyPaint.get()->setStrokeWidth(0);
+                paint = lazyPaint.get();
+            } else if (xfermodeSupportsCoverageAsAlpha(origPaint.getXfermode())) {
+                U8CPU newAlpha;
+#if 0
+                newAlpha = SkToU8(SkScalarRoundToInt(coverage *
+                                                     origPaint.getAlpha()));
+#else
+                // this is the old technique, which we preserve for now so
+                // we don't change previous results (testing)
+                // the new way seems fine, its just (a tiny bit) different
+                int scale = (int)SkScalarMul(coverage, 256);
+                newAlpha = origPaint.getAlpha() * scale >> 8;
+#endif
+                lazyPaint.set(origPaint);
+                lazyPaint.get()->setStrokeWidth(0);
+                lazyPaint.get()->setAlpha(newAlpha);
+                paint = lazyPaint.get();
+            }
         }
     }
 
@@ -1477,9 +1483,9 @@
 SkDraw1Glyph::Proc SkDraw1Glyph::init(const SkDraw* draw, SkBlitter* blitter,
                                       SkGlyphCache* cache) {
     fDraw = draw;
-	fBounder = draw->fBounder;
-	fBlitter = blitter;
-	fCache = cache;
+    fBounder = draw->fBounder;
+    fBlitter = blitter;
+    fCache = cache;
 
     if (hasCustomD1GProc(*draw)) {
         // todo: fix this assumption about clips w/ custom
@@ -1577,17 +1583,21 @@
 
     SkFixed fxMask = ~0;
     SkFixed fyMask = ~0;
-    if (paint.isSubpixelText()) {
+    if (cache->isSubpixel()) {
         SkAxisAlignment baseline = SkComputeAxisAlignmentForHText(*matrix);
         if (kX_SkAxisAlignment == baseline) {
             fyMask = 0;
         } else if (kY_SkAxisAlignment == baseline) {
             fxMask = 0;
         }
+    
+    // apply bias here to avoid adding 1/2 the sampling frequency in the loop
+        fx += SK_FixedHalf >> SkGlyph::kSubBits;
+        fy += SK_FixedHalf >> SkGlyph::kSubBits;
+    } else {
+        fx += SK_FixedHalf;
+        fy += SK_FixedHalf;
     }
-    // apply the bias here, so we don't have to add 1/2 in the loop
-    fx += SK_FixedHalf;
-    fy += SK_FixedHalf;
 
     SkAAClipBlitter     aaBlitter;
     SkAutoBlitterChoose blitterChooser;
@@ -1606,7 +1616,7 @@
     SkDraw1Glyph::Proc  proc = d1g.init(this, blitter, cache);
 
     while (text < stop) {
-        const SkGlyph& glyph  = glyphCacheProc(cache, &text, fx & fxMask, fy & fyMask);
+        const SkGlyph& glyph = glyphCacheProc(cache, &text, fx & fxMask, fy & fyMask);
 
         fx += autokern.adjust(glyph);
 
@@ -1757,12 +1767,12 @@
     
     const char*        stop = text + byteLength;
     AlignProc          alignProc = pick_align_proc(paint.getTextAlign());
-	SkDraw1Glyph	   d1g;
-	SkDraw1Glyph::Proc  proc = d1g.init(this, blitter, cache);
+    SkDraw1Glyph       d1g;
+    SkDraw1Glyph::Proc proc = d1g.init(this, blitter, cache);
     TextMapState       tms(*matrix, constY);
     TextMapState::Proc tmsProc = tms.pickProc(scalarsPerPosition);
 
-    if (paint.isSubpixelText()) {
+    if (cache->isSubpixel()) {
         // maybe we should skip the rounding if linearText is set
         SkAxisAlignment roundBaseline = SkComputeAxisAlignmentForHText(*matrix);
 
@@ -1771,8 +1781,13 @@
 
                 tmsProc(tms, pos);
 
+#ifdef SK_DRAW_POS_TEXT_IGNORE_SUBPIXEL_LEFT_ALIGN_FIX
                 SkFixed fx = SkScalarToFixed(tms.fLoc.fX);
                 SkFixed fy = SkScalarToFixed(tms.fLoc.fY);
+#else
+                SkFixed fx = SkScalarToFixed(tms.fLoc.fX) + (SK_FixedHalf >> SkGlyph::kSubBits);
+                SkFixed fy = SkScalarToFixed(tms.fLoc.fY) + (SK_FixedHalf >> SkGlyph::kSubBits);
+#endif
                 SkFixed fxMask = ~0;
                 SkFixed fyMask = ~0;
 
@@ -1807,8 +1822,8 @@
                     {
                         SkIPoint fixedLoc;
                         alignProc(tms.fLoc, *glyph, &fixedLoc);
-                        fx = fixedLoc.fX;
-                        fy = fixedLoc.fY;
+                        fx = fixedLoc.fX + (SK_FixedHalf >> SkGlyph::kSubBits);
+                        fy = fixedLoc.fY + (SK_FixedHalf >> SkGlyph::kSubBits);
 
                         if (kX_SkAxisAlignment == roundBaseline) {
                             fyMask = 0;
@@ -1840,8 +1855,10 @@
                 SkIPoint fixedLoc;
                 alignProc(tms.fLoc, glyph, &fixedLoc);
 
-                proc(d1g, fixedLoc.fX + SK_FixedHalf,
-                     fixedLoc.fY + SK_FixedHalf, glyph);
+                proc(d1g,
+                     fixedLoc.fX + SK_FixedHalf,
+                     fixedLoc.fY + SK_FixedHalf,
+                     glyph);
             }
             pos += scalarsPerPosition;
         }
diff --git a/src/core/SkDrawProcs.h b/src/core/SkDrawProcs.h
index 74aa9bb..2e26ecf 100644
--- a/src/core/SkDrawProcs.h
+++ b/src/core/SkDrawProcs.h
@@ -14,19 +14,20 @@
 class SkBlitter;
 
 struct SkDraw1Glyph {
-    const SkDraw*   fDraw;
-	SkBounder*		fBounder;
-	const SkRegion*	fClip;
-	const SkAAClip*	fAAClip;
-	SkBlitter*		fBlitter;
-	SkGlyphCache*	fCache;
-	SkIRect			fClipBounds;
-	
-    // The fixed x,y have been pre-rounded (i.e. 1/2 has already been added),
-    // so the impls need just trunc down to an int
-	typedef void (*Proc)(const SkDraw1Glyph&, SkFixed x, SkFixed y, const SkGlyph&);
-	
-	Proc init(const SkDraw* draw, SkBlitter* blitter, SkGlyphCache* cache);
+    const SkDraw* fDraw;
+    SkBounder* fBounder;
+    const SkRegion* fClip;
+    const SkAAClip* fAAClip;
+    SkBlitter* fBlitter;
+    SkGlyphCache* fCache;
+    SkIRect fClipBounds;
+
+    // The fixed x,y are pre-rounded, so impls just trunc them down to ints.
+    // i.e. half the sampling frequency has been added.
+    // e.g. 1/2 or 1/(2^(SkGlyph::kSubBits+1)) has already been added.
+    typedef void (*Proc)(const SkDraw1Glyph&, SkFixed x, SkFixed y, const SkGlyph&);
+    
+    Proc init(const SkDraw* draw, SkBlitter* blitter, SkGlyphCache* cache);
 };
 
 struct SkDrawProcs {
@@ -34,13 +35,12 @@
 };
 
 /**
- *  If the current paint is set to stroke, has a compatible xfermode, and the
- *  stroke-width when applied to the matrix is <= 1.0, then this returns true,
- *  and sets newAlpha (simulating a stroke by drawing a hairline + newAlpha).
- *  If any of these conditions are false, then this returns false and modulate
- *  is ignored.
+ *  If the current paint is set to stroke and the stroke-width when applied to 
+ *  the matrix is <= 1.0, then this returns true, and sets coverage (simulating
+ *  a stroke by drawing a hairline with partial coverage). If any of these 
+ *  conditions are false, then this returns false and coverage is ignored.
  */
-bool SkDrawTreatAsHairline(const SkPaint&, const SkMatrix&, SkAlpha* newAlpha);
+bool SkDrawTreatAsHairline(const SkPaint&, const SkMatrix&, SkScalar* coverage);
 
 #endif
 
diff --git a/src/core/SkFlattenable.cpp b/src/core/SkFlattenable.cpp
index 59a262a..131cf4f 100644
--- a/src/core/SkFlattenable.cpp
+++ b/src/core/SkFlattenable.cpp
@@ -58,6 +58,7 @@
     fFactoryTDArray = NULL;
     fFactoryArray = NULL;
     fFactoryCount = 0;
+    fPictureVersion = PICTURE_VERSION_JB;
 }
 
 SkFlattenableReadBuffer::SkFlattenableReadBuffer(const void* data) :
@@ -71,6 +72,7 @@
     fFactoryTDArray = NULL;
     fFactoryArray = NULL;
     fFactoryCount = 0;
+    fPictureVersion = PICTURE_VERSION_JB;
 }
 
 SkFlattenableReadBuffer::SkFlattenableReadBuffer(const void* data, size_t size)
@@ -84,6 +86,7 @@
     fFactoryTDArray = NULL;
     fFactoryArray = NULL;
     fFactoryCount = 0;
+    fPictureVersion = PICTURE_VERSION_JB;
 }
 
 SkTypeface* SkFlattenableReadBuffer::readTypeface() {
@@ -110,6 +113,43 @@
 }
 
 SkFlattenable* SkFlattenableReadBuffer::readFlattenable() {
+
+    if(fPictureVersion == PICTURE_VERSION_ICS) {
+        SkFlattenable::Factory factory = NULL;
+
+        if (fFactoryCount > 0) {
+            uint32_t index = this->readU32();
+            if (index > 0) {
+                index -= 1;
+                SkASSERT(index < (unsigned)fFactoryCount);
+                factory = fFactoryArray[index];
+                // if we recorded an index, but failed to get a factory, we need
+                // to skip the flattened data in the buffer
+                if (NULL == factory) {
+                    uint32_t size = this->readU32();
+                    this->skip(size);
+                    // fall through and return NULL for the object
+                }
+            }
+        } else {
+            factory = (SkFlattenable::Factory)readFunctionPtr();
+        }
+
+        SkFlattenable* obj = NULL;
+        if (factory) {
+            uint32_t sizeRecorded = this->readU32();
+            uint32_t offset = this->offset();
+            obj = (*factory)(*this);
+            // check that we read the amount we expected
+            uint32_t sizeRead = this->offset() - offset;
+            if (sizeRecorded != sizeRead) {
+                // we could try to fix up the offset...
+                sk_throw();
+            }
+        }
+        return obj;
+    }
+
     SkFlattenable::Factory factory = NULL;
 
     if (fFactoryCount > 0) {
diff --git a/src/core/SkGeometry.cpp b/src/core/SkGeometry.cpp
index 5308d56..de86827 100644
--- a/src/core/SkGeometry.cpp
+++ b/src/core/SkGeometry.cpp
@@ -834,6 +834,65 @@
 }
 #endif
 
+/**
+ *  Given an array and count, remove all pair-wise duplicates from the array,
+ *  keeping the existing sorting, and return the new count
+ */
+static int collaps_duplicates(float array[], int count) {
+    // n is the number of entries still to examine, starting at 'array'
+    for (int n = count; n > 1; --n) {
+        if (array[0] == array[1]) {
+            for (int i = 1; i < n; ++i) {
+                array[i - 1] = array[i];
+            }
+            count -= 1;
+        } else {
+            array += 1;
+        }
+    }
+    return count;
+}
+
+#ifdef SK_DEBUG
+
+#define TEST_COLLAPS_ENTRY(array)   array, SK_ARRAY_COUNT(array)
+
+static void test_collaps_duplicates() {
+    static bool gOnce;  // the checks below run only on the first call
+    if (gOnce) { return; }
+    gOnce = true;
+    const float src0[] = { 0 };
+    const float src1[] = { 0, 0 };
+    const float src2[] = { 0, 1 };
+    const float src3[] = { 0, 0, 0 };
+    const float src4[] = { 0, 0, 1 };
+    const float src5[] = { 0, 1, 1 };
+    const float src6[] = { 0, 1, 2 };
+    const struct {
+        const float* fData;
+        int fCount;
+        int fCollapsedCount;  // expected return value of collaps_duplicates
+    } data[] = {
+        { TEST_COLLAPS_ENTRY(src0), 1 },
+        { TEST_COLLAPS_ENTRY(src1), 1 },
+        { TEST_COLLAPS_ENTRY(src2), 2 },
+        { TEST_COLLAPS_ENTRY(src3), 1 },
+        { TEST_COLLAPS_ENTRY(src4), 2 },
+        { TEST_COLLAPS_ENTRY(src5), 2 },
+        { TEST_COLLAPS_ENTRY(src6), 3 },
+    };
+    for (size_t i = 0; i < SK_ARRAY_COUNT(data); ++i) {
+        float dst[3];  // scratch copy; the longest source array has 3 entries
+        memcpy(dst, data[i].fData, data[i].fCount * sizeof(dst[0]));
+        int count = collaps_duplicates(dst, data[i].fCount);
+        SkASSERT(data[i].fCollapsedCount == count);
+        for (int j = 1; j < count; ++j) {
+            SkASSERT(dst[j-1] < dst[j]);  // survivors must strictly increase
+        }
+    }
+}
+#endif
+
 #if defined _WIN32 && _MSC_VER >= 1300  && defined SK_SCALAR_IS_FIXED // disable warning : unreachable code if building fixed point for windows desktop
 #pragma warning ( disable : 4702 )
 #endif
@@ -841,6 +900,9 @@
 /*  Solve coeff(t) == 0, returning the number of roots that
     lie withing 0 < t < 1.
     coeff[0]t^3 + coeff[1]t^2 + coeff[2]t + coeff[3]
+ 
+    Eliminates repeated roots (so that all tValues are distinct, and are always
+    in increasing order).
 */
 static int solve_cubic_polynomial(const SkFP coeff[4], SkScalar tValues[3])
 {
@@ -895,8 +957,14 @@
         if (is_unit_interval(r))
             *roots++ = r;
 
+        SkDEBUGCODE(test_collaps_duplicates();)
+
         // now sort the roots
-        bubble_sort(tValues, (int)(roots - tValues));
+        int count = (int)(roots - tValues);
+        SkASSERT((unsigned)count <= 3);
+        bubble_sort(tValues, count);
+        count = collaps_duplicates(tValues, count);
+        roots = tValues + count;    // so we compute the proper count below
 #endif
     }
     else                // we have 1 real root
diff --git a/src/core/SkMaskFilter.cpp b/src/core/SkMaskFilter.cpp
index 36855a4..42d07a6 100644
--- a/src/core/SkMaskFilter.cpp
+++ b/src/core/SkMaskFilter.cpp
@@ -72,3 +72,4 @@
     }
 }
 
+
diff --git a/src/core/SkMemory_stdlib.cpp b/src/core/SkMemory_stdlib.cpp
index df87359..033b331 100644
--- a/src/core/SkMemory_stdlib.cpp
+++ b/src/core/SkMemory_stdlib.cpp
@@ -29,7 +29,7 @@
 #define kByteFill 0xCD
 #define kDeleteFill 0xEF
 
-static SkMutex& get_block_mutex() {
+static SkBaseMutex& get_block_mutex() {
     static SkMutex* gBlockMutex;
     if (NULL == gBlockMutex) {
         gBlockMutex = new SkMutex;
diff --git a/src/core/SkPaint.cpp b/src/core/SkPaint.cpp
index a6d30c9..b0b855a 100644
--- a/src/core/SkPaint.cpp
+++ b/src/core/SkPaint.cpp
@@ -23,15 +23,12 @@
 #include "SkXfermode.h"
 #include "SkAutoKern.h"
 #include "SkGlyphCache.h"
+#include "SkPaintDefaults.h"
 
 // define this to get a printf for out-of-range parameter in setters
 // e.g. setTextSize(-1)
 //#define SK_REPORT_API_RANGE_CHECK
 
-#define SK_DefaultTextSize      SkIntToScalar(12)
-
-#define SK_DefaultFlags         0   //(kNativeHintsText_Flag)
-
 #ifdef SK_BUILD_FOR_ANDROID
 #define GEN_ID_INC                  fGenerationID++
 #define GEN_ID_INC_EVAL(expression) if (expression) { fGenerationID++; }
@@ -60,17 +57,17 @@
     fWidth      = 0;
 #endif
 
-    fTextSize   = SK_DefaultTextSize;
+    fTextSize   = SkPaintDefaults_TextSize;
     fTextScaleX = SK_Scalar1;
     fColor      = SK_ColorBLACK;
-    fMiterLimit = SK_DefaultMiterLimit;
-    fFlags      = SK_DefaultFlags;
+    fMiterLimit = SkPaintDefaults_MiterLimit;
+    fFlags      = SkPaintDefaults_Flags;
     fCapType    = kDefault_Cap;
     fJoinType   = kDefault_Join;
     fTextAlign  = kLeft_Align;
     fStyle      = kFill_Style;
     fTextEncoding = kUTF8_TextEncoding;
-    fHinting    = kNormal_Hinting;
+    fHinting    = SkPaintDefaults_Hinting;
 #ifdef SK_BUILD_FOR_ANDROID
     fGenerationID = 0;
 #endif
@@ -1312,6 +1309,39 @@
     return true;
 }
 
+#ifdef SK_USE_COLOR_LUMINANCE
+static SkColor computeLuminanceColor(const SkPaint& paint) {
+    SkColor c;
+    if (!justAColor(paint, &c)) {
+        c = SkColorSetRGB(0x7F, 0x80, 0x7F);
+    }
+    return c;
+}
+
+#define assert_byte(x)  SkASSERT(0 == ((x) >> 8))
+
+static U8CPU reduce_lumbits(U8CPU x) {
+    static const uint8_t gReduceBits[] = {
+        0x0, 0x55, 0xAA, 0xFF
+    };
+    assert_byte(x);
+    return gReduceBits[x >> 6];
+}
+
+static unsigned computeLuminance(SkColor c) {
+    int r = SkColorGetR(c);
+    int g = SkColorGetG(c);
+    int b = SkColorGetB(c);
+    // compute luminance
+    // R=0.2126 G=0.7152 B=0.0722
+    // scaling by 128 (to match the >> 7 below) yields 27, 92, 9
+    int luminance = r * 27 + g * 92 + b * 9;
+    luminance >>= 7;
+    assert_byte(luminance);
+    return luminance;
+}
+
+#else
 // returns 0..kLuminance_Max
 static unsigned computeLuminance(const SkPaint& paint) {
     SkColor c;
@@ -1335,6 +1365,7 @@
     // if we're not a single color, return the middle of the luminance range
     return SkScalerContext::kLuminance_Max >> 1;
 }
+#endif
 
 // Beyond this size, LCD doesn't appreciably improve quality, but it always
 // cost more RAM and draws slower, so we set a cap.
@@ -1458,11 +1489,18 @@
     if (paint.isVerticalText()) {
         flags |= SkScalerContext::kVertical_Flag;
     }
+    if (paint.getFlags() & SkPaint::kGenA8FromLCD_Flag) {
+        flags |= SkScalerContext::kGenA8FromLCD_Flag;
+    }
     rec->fFlags = SkToU16(flags);
 
     // these modify fFlags, so do them after assigning fFlags
     rec->setHinting(computeHinting(paint));
+#ifdef SK_USE_COLOR_LUMINANCE
+    rec->setLuminanceColor(computeLuminanceColor(paint));
+#else
     rec->setLuminanceBits(computeLuminance(paint));
+#endif
 
     /*  Allow the fonthost to modify our rec before we use it as a key into the
         cache. This way if we're asking for something that they will ignore,
@@ -1471,9 +1509,51 @@
      */
     SkFontHost::FilterRec(rec);
 
-    // No need to differentiate gamma if we're BW
-    if (SkMask::kBW_Format == rec->fMaskFormat) {
-        rec->setLuminanceBits(0);
+    // be sure to call PostMakeRec(rec) before you actually use it!
+}
+
+/**
+ *  We ensure that the rec is self-consistent and efficient (where possible)
+ */
+void SkScalerContext::PostMakeRec(SkScalerContext::Rec* rec) {
+
+    /**
+     *  If we're asking for A8, we force the colorlum to be gray, since that
+     *  limits the number of unique entries, and the scaler will only
+     *  look at the lum of one of them.
+     */
+    switch (rec->fMaskFormat) {
+        case SkMask::kLCD16_Format:
+        case SkMask::kLCD32_Format: {
+#ifdef SK_USE_COLOR_LUMINANCE
+            // filter down the luminance color to a finite number of bits
+            SkColor c = rec->getLuminanceColor();
+            c = SkColorSetRGB(reduce_lumbits(SkColorGetR(c)),
+                              reduce_lumbits(SkColorGetG(c)),
+                              reduce_lumbits(SkColorGetB(c)));
+            rec->setLuminanceColor(c);
+#endif
+            break;
+        }
+        case SkMask::kA8_Format: {
+#ifdef SK_USE_COLOR_LUMINANCE
+            // filter down the luminance to a single component, since A8 can't
+            // use per-component information
+            unsigned lum = computeLuminance(rec->getLuminanceColor());
+            // reduce to our finite number of bits
+            lum = reduce_lumbits(lum);
+            rec->setLuminanceColor(SkColorSetRGB(lum, lum, lum));
+#endif
+            break;
+        }
+        case SkMask::kBW_Format:
+            // No need to differentiate gamma if we're BW
+#ifdef SK_USE_COLOR_LUMINANCE
+            rec->setLuminanceColor(0);
+#else
+            rec->setLuminanceBits(0);
+#endif
+            break;
     }
 }
 
@@ -1497,7 +1577,11 @@
 
     SkScalerContext::MakeRec(*this, deviceMatrix, &rec);
     if (ignoreGamma) {
+#ifdef SK_USE_COLOR_LUMINANCE
+        rec.setLuminanceColor(0);
+#else
         rec.setLuminanceBits(0);
+#endif
     }
 
     size_t          descSize = sizeof(rec);
@@ -1529,6 +1613,11 @@
         entryCount += 1;
         rec.fMaskFormat = SkMask::kA8_Format;   // force antialiasing when we do the scan conversion
     }
+
+    ///////////////////////////////////////////////////////////////////////////
+    // Now that we're done tweaking the rec, call the PostMakeRec cleanup
+    SkScalerContext::PostMakeRec(&rec);
+    
     descSize += SkDescriptor::ComputeOverhead(entryCount);
 
     SkAutoDescriptor    ad(descSize);
@@ -1722,11 +1811,16 @@
     uint32_t tmp = *pod++;
     this->setFlags(tmp >> 16);
 
-    // hinting added later. 0 in this nibble means use the default.
-    uint32_t hinting = (tmp >> 12) & 0xF;
-    this->setHinting(0 == hinting ? kNormal_Hinting : static_cast<Hinting>(hinting-1));
+    if (buffer.getPictureVersion() == PICTURE_VERSION_ICS) {
+        this->setTextAlign(static_cast<Align>((tmp >> 8) & 0xFF));
+        this->setHinting(SkPaintDefaults_Hinting);
+    } else {
+        // hinting added later. 0 in this nibble means use the default.
+        uint32_t hinting = (tmp >> 12) & 0xF;
+        this->setHinting(0 == hinting ? kNormal_Hinting : static_cast<Hinting>(hinting-1));
 
-    this->setTextAlign(static_cast<Align>((tmp >> 8) & 0xF));
+        this->setTextAlign(static_cast<Align>((tmp >> 8) & 0xF));
+    }
 
     uint8_t flatFlags = tmp & 0xFF;
 
@@ -1750,7 +1844,10 @@
         SkSafeUnref(this->setColorFilter((SkColorFilter*) buffer.readFlattenable()));
         SkSafeUnref(this->setRasterizer((SkRasterizer*) buffer.readFlattenable()));
         SkSafeUnref(this->setLooper((SkDrawLooper*) buffer.readFlattenable()));
-        SkSafeUnref(this->setImageFilter((SkImageFilter*) buffer.readFlattenable()));
+        if (buffer.getPictureVersion() != PICTURE_VERSION_ICS)
+            SkSafeUnref(this->setImageFilter((SkImageFilter*) buffer.readFlattenable()));
+        else
+            this->setImageFilter(NULL);
     } else {
         this->setPathEffect(NULL);
         this->setShader(NULL);
@@ -2051,7 +2148,16 @@
     return false;
 }
 
-///////////////////////////////////////////////////////////////////////////////
+bool SkImageFilter::asAnErode(SkISize* radius) const {
+    return false;
+}
+
+bool SkImageFilter::asADilate(SkISize* radius) const {
+    return false;
+}
+
+//////
+
 bool SkDrawLooper::canComputeFastBounds(const SkPaint& paint) {
     SkCanvas canvas;
 
@@ -2073,7 +2179,7 @@
 void SkDrawLooper::computeFastBounds(const SkPaint& paint, const SkRect& src,
                                      SkRect* dst) {
     SkCanvas canvas;
-
+    
     this->init(&canvas);
     for (bool firstTime = true;; firstTime = false) {
         SkPaint p(paint);
diff --git a/src/core/SkPaintDefaults.h b/src/core/SkPaintDefaults.h
new file mode 100644
index 0000000..3ea1cd3
--- /dev/null
+++ b/src/core/SkPaintDefaults.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkPaintDefaults_DEFINED
+#define SkPaintDefaults_DEFINED
+
+#include "SkPaint.h"
+
+/**
+ *  Any of these can be specified by the build system (or SkUserConfig.h)
+ *  to change the default values for a SkPaint. This file should not be
+ *  edited directly.
+ */
+
+#ifndef SkPaintDefaults_Flags
+    #define SkPaintDefaults_Flags           0
+#endif
+
+#ifndef SkPaintDefaults_TextSize
+    #define SkPaintDefaults_TextSize        SkIntToScalar(12)
+#endif
+
+#ifndef SkPaintDefaults_Hinting
+    #define SkPaintDefaults_Hinting         SkPaint::kNormal_Hinting
+#endif
+
+#ifndef SkPaintDefaults_MiterLimit
+    #define SkPaintDefaults_MiterLimit      SkIntToScalar(4)
+#endif
+
+#endif
diff --git a/src/core/SkPath.cpp b/src/core/SkPath.cpp
index c99db4c..7f58ae3 100644
--- a/src/core/SkPath.cpp
+++ b/src/core/SkPath.cpp
@@ -26,6 +26,12 @@
     dst->fBottom = SkMaxScalar(dst->fBottom, src.fBottom);
 }
 
+static bool is_degenerate(const SkPath& path) {
+    SkPath::Iter iter(path, false);
+    SkPoint pts[4];
+    return SkPath::kDone_Verb == iter.next(pts);
+}
+
 /*  This guy's constructor/destructor bracket a path editing operation. It is
     used when we know the bounds of the amount we are going to add to the path
     (usually a new contour, but not required).
@@ -34,8 +40,9 @@
     cached bounds), and the if it can, it updates the cache bounds explicitly,
     avoiding the need to revisit all of the points in getBounds().
 
-    It also notes if the path was originally empty, and if so, sets isConvex
-    to true. Thus it can only be used if the contour being added is convex.
+    It also notes if the path was originally degenerate, and if so, sets
+    isConvex to true. Thus it can only be used if the contour being added is
+    convex.
  */
 class SkAutoPathBoundsUpdate {
 public:
@@ -50,7 +57,7 @@
     }
 
     ~SkAutoPathBoundsUpdate() {
-        fPath->setIsConvex(fEmpty);
+        fPath->setIsConvex(fDegenerate);
         if (fEmpty) {
             fPath->fBounds = fRect;
             fPath->fBoundsIsDirty = false;
@@ -64,12 +71,14 @@
     SkPath* fPath;
     SkRect  fRect;
     bool    fDirty;
+    bool    fDegenerate;
     bool    fEmpty;
 
     // returns true if we should proceed
     void init(SkPath* path) {
         fPath = path;
         fDirty = SkToBool(path->fBoundsIsDirty);
+        fDegenerate = is_degenerate(*path);
         fEmpty = path->isEmpty();
         // Cannot use fRect for our bounds unless we know it is sorted
         fRect.sort();
@@ -101,13 +110,18 @@
 
 ////////////////////////////////////////////////////////////////////////////
 
+// flag to require a moveTo if we begin with something else, like lineTo etc.
+#define INITIAL_LASTMOVETOINDEX_VALUE   ~0
+
 SkPath::SkPath() 
     : fFillType(kWinding_FillType)
     , fBoundsIsDirty(true) {
     fConvexity = kUnknown_Convexity;
     fSegmentMask = 0;
+    fLastMoveToIndex = INITIAL_LASTMOVETOINDEX_VALUE;
 #ifdef SK_BUILD_FOR_ANDROID
     fGenerationID = 0;
+    fSourcePath = NULL;
 #endif
 }
 
@@ -116,7 +130,8 @@
     *this = src;
 #ifdef SK_BUILD_FOR_ANDROID
     // the assignment operator above increments the ID so correct for that here
-    fGenerationID--;
+    fGenerationID = src.fGenerationID;
+    fSourcePath = NULL;
 #endif
 }
 
@@ -135,6 +150,7 @@
         fBoundsIsDirty  = src.fBoundsIsDirty;
         fConvexity      = src.fConvexity;
         fSegmentMask    = src.fSegmentMask;
+        fLastMoveToIndex = src.fLastMoveToIndex;
         GEN_ID_INC;
     }
     SkDEBUGCODE(this->validate();)
@@ -165,6 +181,7 @@
         SkTSwap<uint8_t>(fBoundsIsDirty, other.fBoundsIsDirty);
         SkTSwap<uint8_t>(fConvexity, other.fConvexity);
         SkTSwap<uint8_t>(fSegmentMask, other.fSegmentMask);
+        SkTSwap<int>(fLastMoveToIndex, other.fLastMoveToIndex);
         GEN_ID_INC;
     }
 }
@@ -173,6 +190,14 @@
 uint32_t SkPath::getGenerationID() const {
     return fGenerationID;
 }
+
+const SkPath* SkPath::getSourcePath() const {
+    return fSourcePath;
+}
+
+void SkPath::setSourcePath(const SkPath* path) {
+    fSourcePath = path;
+}
 #endif
 
 void SkPath::reset() {
@@ -184,6 +209,7 @@
     fBoundsIsDirty = true;
     fConvexity = kUnknown_Convexity;
     fSegmentMask = 0;
+    fLastMoveToIndex = INITIAL_LASTMOVETOINDEX_VALUE;
 }
 
 void SkPath::rewind() {
@@ -195,16 +221,12 @@
     fConvexity = kUnknown_Convexity;
     fBoundsIsDirty = true;
     fSegmentMask = 0;
+    fLastMoveToIndex = INITIAL_LASTMOVETOINDEX_VALUE;
 }
 
 bool SkPath::isEmpty() const {
     SkDEBUGCODE(this->validate();)
-#if SK_OLD_EMPTY_PATH_BEHAVIOR
-    int count = fVerbs.count();
-    return count == 0 || (count == 1 && fVerbs[0] == kMove_Verb);
-#else
     return 0 == fVerbs.count();
-#endif
 }
 
 /*
@@ -395,6 +417,11 @@
         fConvexity = kUnknown_Convexity; \
     } while (0)
 
+#define DIRTY_AFTER_EDIT_NO_CONVEXITY_CHANGE    \
+    do {                                        \
+        fBoundsIsDirty = true;                  \
+    } while (0)
+
 void SkPath::incReserve(U16CPU inc) {
     SkDEBUGCODE(this->validate();)
 
@@ -410,21 +437,15 @@
     int      vc = fVerbs.count();
     SkPoint* pt;
 
-#ifdef SK_OLD_EMPTY_PATH_BEHAVIOR
-    if (vc > 0 && fVerbs[vc - 1] == kMove_Verb) {
-        pt = &fPts[fPts.count() - 1];
-    } else {
-        pt = fPts.append();
-        *fVerbs.append() = kMove_Verb;
-    }
-#else
+    // remember our index
+    fLastMoveToIndex = fPts.count();
+
     pt = fPts.append();
     *fVerbs.append() = kMove_Verb;
-#endif
     pt->set(x, y);
 
     GEN_ID_INC;
-    DIRTY_AFTER_EDIT;
+    DIRTY_AFTER_EDIT_NO_CONVEXITY_CHANGE;
 }
 
 void SkPath::rMoveTo(SkScalar x, SkScalar y) {
@@ -433,13 +454,25 @@
     this->moveTo(pt.fX + x, pt.fY + y);
 }
 
+void SkPath::injectMoveToIfNeeded() {
+    if (fLastMoveToIndex < 0) {
+        SkScalar x, y;
+        if (fVerbs.count() == 0) {
+            x = y = 0;
+        } else {
+            const SkPoint& pt = fPts[~fLastMoveToIndex];
+            x = pt.fX;
+            y = pt.fY;
+        }
+        this->moveTo(x, y);
+    }
+}
+
 void SkPath::lineTo(SkScalar x, SkScalar y) {
     SkDEBUGCODE(this->validate();)
 
-    if (fVerbs.count() == 0) {
-        fPts.append()->set(0, 0);
-        *fVerbs.append() = kMove_Verb;
-    }
+    this->injectMoveToIfNeeded();
+
     fPts.append()->set(x, y);
     *fVerbs.append() = kLine_Verb;
     fSegmentMask |= kLine_SegmentMask;
@@ -457,10 +490,7 @@
 void SkPath::quadTo(SkScalar x1, SkScalar y1, SkScalar x2, SkScalar y2) {
     SkDEBUGCODE(this->validate();)
 
-    if (fVerbs.count() == 0) {
-        fPts.append()->set(0, 0);
-        *fVerbs.append() = kMove_Verb;
-    }
+    this->injectMoveToIfNeeded();
 
     SkPoint* pts = fPts.append(2);
     pts[0].set(x1, y1);
@@ -482,10 +512,8 @@
                      SkScalar x3, SkScalar y3) {
     SkDEBUGCODE(this->validate();)
 
-    if (fVerbs.count() == 0) {
-        fPts.append()->set(0, 0);
-        *fVerbs.append() = kMove_Verb;
-    }
+    this->injectMoveToIfNeeded();
+
     SkPoint* pts = fPts.append(3);
     pts[0].set(x1, y1);
     pts[1].set(x2, y2);
@@ -514,9 +542,7 @@
             case kLine_Verb:
             case kQuad_Verb:
             case kCubic_Verb:
-#ifndef SK_OLD_EMPTY_PATH_BEHAVIOR
             case kMove_Verb:
-#endif
                 *fVerbs.append() = kClose_Verb;
                 GEN_ID_INC;
                 break;
@@ -525,6 +551,15 @@
                 break;
         }
     }
+
+    // signal that we need a moveTo to follow us (unless we're done)
+#if 0
+    if (fLastMoveToIndex >= 0) {
+        fLastMoveToIndex = ~fLastMoveToIndex;
+    }
+#else
+    fLastMoveToIndex ^= ~fLastMoveToIndex >> (8 * sizeof(fLastMoveToIndex) - 1);
+#endif
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1203,8 +1238,9 @@
 ///////////////////////////////////////////////////////////////////////////////
 
 enum SegmentState {
-    kAfterClose_SegmentState,     // We will need a move next, but we have a
-                                  // previous close pt to use for the new move.
+    kEmptyContour_SegmentState,   // The current contour is empty. We may be
+                                  // starting processing or we may have just
+                                  // closed a contour.
     kAfterMove_SegmentState,      // We have seen a move, but nothing else.
     kAfterPrimitive_SegmentState  // We have seen a primitive but not yet
                                   // closed the path. Also the initial state.
@@ -1215,7 +1251,7 @@
     fPts = NULL;
     fMoveTo.fX = fMoveTo.fY = fLastPt.fX = fLastPt.fY = 0;
     fForceClose = fCloseLine = false;
-    fSegmentState = kAfterPrimitive_SegmentState;
+    fSegmentState = kEmptyContour_SegmentState;
 #endif
     // need to init enough to make next() harmlessly return kDone_Verb
     fVerbs = NULL;
@@ -1235,7 +1271,7 @@
     fMoveTo.fX = fMoveTo.fY = 0;
     fForceClose = SkToU8(forceClose);
     fNeedClose = false;
-    fSegmentState = kAfterClose_SegmentState;
+    fSegmentState = kEmptyContour_SegmentState;
 }
 
 bool SkPath::Iter::isClosedContour() const {
@@ -1289,18 +1325,6 @@
 }
 
 bool SkPath::Iter::cons_moveTo(SkPoint pts[1]) {
-    if (fSegmentState == kAfterClose_SegmentState) {
-        // We have closed a curve and have a primitive, so we need a move.
-        // Set the first return pt to the most recent move pt
-        if (pts) {
-            *pts = fMoveTo;
-        }
-        fNeedClose = fForceClose;
-        fSegmentState = kAfterMove_SegmentState;
-        fVerbs -= 1; // Step back to see the primitive again
-        return true;
-    }
-
     if (fSegmentState == kAfterMove_SegmentState) {
         // Set the first return pt to the move pt
         if (pts) {
@@ -1393,17 +1417,11 @@
 }
 
 SkPath::Verb SkPath::Iter::next(SkPoint pts[4]) {
-#ifndef SK_OLD_EMPTY_PATH_BEHAVIOR
     this->consumeDegenerateSegments();
-#endif
 
     if (fVerbs == fVerbStop) {
         // Close the curve if requested and if there is some curve to close
-#ifdef SK_OLD_EMPTY_PATH_BEHAVIOR
-        if (fNeedClose) {
-#else
         if (fNeedClose && fSegmentState == kAfterPrimitive_SegmentState) {
-#endif
             if (kLine_Verb == this->autoClose(pts)) {
                 return kLine_Verb;
             }
@@ -1435,9 +1453,7 @@
             }
             srcPts += 1;
             fSegmentState = kAfterMove_SegmentState;
-#ifndef SK_OLD_EMPTY_PATH_BEHAVIOR
             fLastPt = fMoveTo;
-#endif
             fNeedClose = fForceClose;
             break;
         case kLine_Verb:
@@ -1477,15 +1493,9 @@
                 fVerbs -= 1;
             } else {
                 fNeedClose = false;
-#ifndef SK_OLD_EMPTY_PATH_BEHAVIOR
-                fSegmentState = kAfterClose_SegmentState;
-#endif
+                fSegmentState = kEmptyContour_SegmentState;
             }
-#ifdef SK_OLD_EMPTY_PATH_BEHAVIOR
-            fSegmentState = kAfterClose_SegmentState;
-#else
             fLastPt = fMoveTo;
-#endif
             break;
     }
     fPts = srcPts;
@@ -1909,21 +1919,41 @@
     SkDEBUGCODE(++fContourCounter;)
 }
 
+// returns cross product of (p1 - p0) and (p2 - p0)
 static SkScalar cross_prod(const SkPoint& p0, const SkPoint& p1, const SkPoint& p2) {
-    return SkPoint::CrossProduct(p1 - p0, p2 - p0);
+    SkScalar cross = SkPoint::CrossProduct(p1 - p0, p2 - p0);
+    // We may get 0 when the above subtracts underflow. We expect this to be
+    // very rare and lazily promote to double.
+    if (0 == cross) {
+        double p0x = SkScalarToDouble(p0.fX);
+        double p0y = SkScalarToDouble(p0.fY);
+
+        double p1x = SkScalarToDouble(p1.fX);
+        double p1y = SkScalarToDouble(p1.fY);
+
+        double p2x = SkScalarToDouble(p2.fX);
+        double p2y = SkScalarToDouble(p2.fY);
+
+        cross = SkDoubleToScalar((p1x - p0x) * (p2y - p0y) -
+                                 (p1y - p0y) * (p2x - p0x));
+
+    }
+    return cross;
 }
 
+// Returns the index of the first pt with the maximum Y coordinate
 static int find_max_y(const SkPoint pts[], int count) {
     SkASSERT(count > 0);
     SkScalar max = pts[0].fY;
-    int maxIndex = 0;
+    int firstIndex = 0;
     for (int i = 1; i < count; ++i) {
-        if (pts[i].fY > max) {
-            max = pts[i].fY;
-            maxIndex = i;
+        SkScalar y = pts[i].fY;
+        if (y > max) {
+            max = y;
+            firstIndex = i;
         }
     }
-    return maxIndex;
+    return firstIndex;
 }
 
 static int find_diff_pt(const SkPoint pts[], int index, int n, int inc) {
@@ -1940,13 +1970,71 @@
     return i;
 }
 
+/**
+ *  Scan forward from index over the contiguous points that share its Y; return
+ *  the index of the min-X point and store the max-X index in *maxIndexPtr.
+ */
+static int find_min_max_x_at_y(const SkPoint pts[], int index, int n,
+                               int* maxIndexPtr) {
+    const SkScalar y = pts[index].fY;
+    SkScalar min = pts[index].fX;
+    SkScalar max = min;
+    int minIndex = index;
+    int maxIndex = index;
+    for (int i = index + 1; i < n; ++i) {
+        if (pts[i].fY != y) {
+            break;
+        }
+        SkScalar x = pts[i].fX;
+        if (x < min) {
+            min = x;
+            minIndex = i;
+        } else if (x > max) {
+            max = x;
+            maxIndex = i;
+        }
+    }
+    *maxIndexPtr = maxIndex;
+    return minIndex;
+}
+
+static bool crossToDir(SkScalar cross, SkPath::Direction* dir) {
+    if (dir) {
+        *dir = cross > 0 ? SkPath::kCW_Direction : SkPath::kCCW_Direction;
+    }
+    return true;
+}
+
+#if 0
+#include "SkString.h"
+#include "../utils/SkParsePath.h"
+static void dumpPath(const SkPath& path) {
+    SkString str;
+    SkParsePath::ToSVGString(path, &str);
+    SkDebugf("%s\n", str.c_str());
+}
+#endif
+
+/*
+ *  We loop through all contours, and keep the computed cross-product of the
+ *  contour that contained the global y-max. If we just look at the first
+ *  contour, we may find one that is wound the opposite way (correctly) since
+ *  it is the interior of a hole (e.g. 'o'). Thus we must find the contour
+ *  that is outer most (or at least has the global y-max) before we can consider
+ *  its cross product.
+ */
 bool SkPath::cheapComputeDirection(Direction* dir) const {
+//    dumpPath(*this);
     // don't want to pay the cost for computing this if it
     // is unknown, so we don't call isConvex()
     const Convexity conv = this->getConvexityOrUnknown();
 
     ContourIter iter(fVerbs, fPts);
 
+    // initialize with our logical y-min
+    SkScalar ymax = this->getBounds().fTop;
+    SkScalar ymaxCross = 0;
+
     for (; !iter.done(); iter.next()) {
         int n = iter.count();
         if (n < 3) {
@@ -1961,36 +2049,65 @@
             for (int i = 0; i < n - 2; ++i) {
                 cross = cross_prod(pts[i], pts[i + 1], pts[i + 2]);
                 if (cross) {
-                    break;
+                    // early-exit, as kConvex is assumed to have only 1
+                    // non-degenerate contour
+                    return crossToDir(cross, dir);
                 }
             }
         } else {
             int index = find_max_y(pts, n);
-            // Find a next and prev index to use for the cross-product test,
-            // but we try to find pts that form non-zero vectors from pts[index]
-            //
-            // Its possible that we can't find two non-degenerate vectors, so
-            // we have to guard our search (e.g. all the pts could be in the
-            // same place).
-            
-            // we pass n - 1 instead of -1 so we don't foul up % operator by
-            // passing it a negative LH argument.
-            int prev = find_diff_pt(pts, index, n, n - 1);
-            if (prev == index) {
-                // completely degenerate, skip to next contour
+            if (pts[index].fY < ymax) {
                 continue;
             }
-            int next = find_diff_pt(pts, index, n, 1);
-            SkASSERT(next != index);
-            cross = cross_prod(pts[prev], pts[index], pts[next]);
-        }
-        if (cross) {
-            if (dir) {
-                *dir = cross > 0 ? kCW_Direction : kCCW_Direction;
+
+            // If there is more than 1 distinct point at the y-max, we take the
+            // x-min and x-max of them and just subtract to compute the dir.
+            if (pts[(index + 1) % n].fY == pts[index].fY) {
+                int maxIndex;
+                int minIndex = find_min_max_x_at_y(pts, index, n, &maxIndex);
+                if (minIndex == maxIndex) {
+                    goto TRY_CROSSPROD;
+                }
+                SkASSERT(pts[minIndex].fY == pts[index].fY);
+                SkASSERT(pts[maxIndex].fY == pts[index].fY);
+                SkASSERT(pts[minIndex].fX <= pts[maxIndex].fX);
+                // we just subtract the indices, and let that auto-convert to
+                // SkScalar, since we just want - or + to signal the direction.
+                cross = minIndex - maxIndex;
+            } else {
+                TRY_CROSSPROD:
+                // Find a next and prev index to use for the cross-product test,
+                // but we try to find pts that form non-zero vectors from pts[index]
+                //
+                // It's possible that we can't find two non-degenerate vectors, so
+                // we have to guard our search (e.g. all the pts could be in the
+                // same place).
+                
+                // we pass n - 1 instead of -1 so we don't foul up % operator by
+                // passing it a negative LH argument.
+                int prev = find_diff_pt(pts, index, n, n - 1);
+                if (prev == index) {
+                    // completely degenerate, skip to next contour
+                    continue;
+                }
+                int next = find_diff_pt(pts, index, n, 1);
+                SkASSERT(next != index);
+                cross = cross_prod(pts[prev], pts[index], pts[next]);
+                // if we get a zero, but the pts aren't on top of each other, then
+                // we can just look at the direction
+                if (0 == cross) {
+                    // construct the subtract so we get the correct Direction below
+                    cross = pts[index].fX - pts[next].fX;
+                }
             }
-            return true;
+            
+            if (cross) {
+                // record our best guess so far
+                ymax = pts[index].fY;
+                ymaxCross = cross;
+            }
         }
     }
-    return false;   // unknown
-}
 
+    return ymaxCross ? crossToDir(ymaxCross, dir) : false;
+}
diff --git a/src/core/SkPathEffect.cpp b/src/core/SkPathEffect.cpp
index 3e017e2..d81ac9f 100644
--- a/src/core/SkPathEffect.cpp
+++ b/src/core/SkPathEffect.cpp
@@ -81,7 +81,7 @@
         : fWidth(width), fMiter(miter), fStyle(SkToU8(style)),
           fJoin(SkToU8(join)), fCap(SkToU8(cap)) {
     if (miter < 0) {  // signal they want the default
-        fMiter = SK_DefaultMiterLimit;
+        fMiter = SkIntToScalar(4);
     }
 }
 
diff --git a/src/core/SkPathMeasure.cpp b/src/core/SkPathMeasure.cpp
index 3158925..04d19c2 100644
--- a/src/core/SkPathMeasure.cpp
+++ b/src/core/SkPathMeasure.cpp
@@ -15,7 +15,6 @@
 // these must be 0,1,2 since they are in our 2-bit field
 enum {
     kLine_SegType,
-    kCloseLine_SegType,
     kQuad_SegType,
     kCubic_SegType
 };
@@ -154,40 +153,43 @@
     Segment*        seg;
 
     fSegments.reset();
-    for (;;) {
+    bool done = false;
+    do {
         switch (fIter.next(pts)) {
             case SkPath::kMove_Verb:
+                ptIndex += 1;
+                fPts.append(1, pts);
                 if (!firstMoveTo) {
-                    goto DONE;
+                    done = true;
+                    break;
                 }
-            ptIndex += 1;
-            firstMoveTo = false;
-            break;
+                firstMoveTo = false;
+                break;
 
             case SkPath::kLine_Verb:
                 d = SkPoint::Distance(pts[0], pts[1]);
                 SkASSERT(d >= 0);
-                if (!SkScalarNearlyZero(d)) {
-                    distance += d;
-                    seg = fSegments.append();
-                    seg->fDistance = distance;
-                    seg->fPtIndex = ptIndex;
-                    seg->fType = fIter.isCloseLine() ?
-                                    kCloseLine_SegType : kLine_SegType;
-                    seg->fTValue = kMaxTValue;
-                }
-                ptIndex += !fIter.isCloseLine();
+                distance += d;
+                seg = fSegments.append();
+                seg->fDistance = distance;
+                seg->fPtIndex = ptIndex;
+                seg->fType = kLine_SegType;
+                seg->fTValue = kMaxTValue;
+                fPts.append(1, pts + 1);
+                ptIndex++;
                 break;
 
             case SkPath::kQuad_Verb:
                 distance = this->compute_quad_segs(pts, distance, 0,
                                                    kMaxTValue, ptIndex);
+                fPts.append(2, pts + 1);
                 ptIndex += 2;
                 break;
 
             case SkPath::kCubic_Verb:
                 distance = this->compute_cubic_segs(pts, distance, 0,
                                                     kMaxTValue, ptIndex);
+                fPts.append(3, pts + 1);
                 ptIndex += 3;
                 break;
 
@@ -196,13 +198,14 @@
                 break;
                 
             case SkPath::kDone_Verb:
-                goto DONE;
+                done = true;
+                break;
         }
-    }
-DONE:
+    } while (!done);
+
     fLength = distance;
     fIsClosed = isClosed;
-    fFirstPtIndex = ptIndex + 1;
+    fFirstPtIndex = ptIndex;
 
 #ifdef SK_DEBUG
     {
@@ -232,31 +235,20 @@
 #endif
 }
 
-// marked as a friend in SkPath.h
-const SkPoint* sk_get_path_points(const SkPath& path, int index) {
-    return &path.fPts[index];
-}
-
-static void compute_pos_tan(const SkPath& path, int firstPtIndex, int ptIndex,
+static void compute_pos_tan(const SkTDArray<SkPoint>& segmentPts, int ptIndex,
                     int segType, SkScalar t, SkPoint* pos, SkVector* tangent) {
-    const SkPoint*  pts = sk_get_path_points(path, ptIndex);
+    const SkPoint*  pts = &segmentPts[ptIndex];
 
     switch (segType) {
         case kLine_SegType:
-        case kCloseLine_SegType: {
-            const SkPoint* endp = (segType == kLine_SegType) ?
-                                    &pts[1] :
-                                    sk_get_path_points(path, firstPtIndex);
-
             if (pos) {
-                pos->set(SkScalarInterp(pts[0].fX, endp->fX, t),
-                        SkScalarInterp(pts[0].fY, endp->fY, t));
+                pos->set(SkScalarInterp(pts[0].fX, pts[1].fX, t),
+                        SkScalarInterp(pts[0].fY, pts[1].fY, t));
             }
             if (tangent) {
-                tangent->setNormalize(endp->fX - pts[0].fX, endp->fY - pts[0].fY);
+                tangent->setNormalize(pts[1].fX - pts[0].fX, pts[1].fY - pts[0].fY);
             }
             break;
-        }
         case kQuad_SegType:
             SkEvalQuadAt(pts, t, pos, tangent);
             if (tangent) {
@@ -274,7 +266,7 @@
     }
 }
 
-static void seg_to(const SkPath& src, int firstPtIndex, int ptIndex,
+static void seg_to(const SkTDArray<SkPoint>& segmentPts, int ptIndex,
                    int segType, SkScalar startT, SkScalar stopT, SkPath* dst) {
     SkASSERT(startT >= 0 && startT <= SK_Scalar1);
     SkASSERT(stopT >= 0 && stopT <= SK_Scalar1);
@@ -284,24 +276,18 @@
         return;
     }
 
-    const SkPoint*  pts = sk_get_path_points(src, ptIndex);
+    const SkPoint*  pts = &segmentPts[ptIndex];
     SkPoint         tmp0[7], tmp1[7];
 
     switch (segType) {
         case kLine_SegType:
-        case kCloseLine_SegType: {
-            const SkPoint* endp = (segType == kLine_SegType) ?
-                                    &pts[1] :
-                                    sk_get_path_points(src, firstPtIndex);
-
             if (stopT == kMaxTValue) {
-                dst->lineTo(*endp);
+                dst->lineTo(pts[1]);
             } else {
-                dst->lineTo(SkScalarInterp(pts[0].fX, endp->fX, stopT),
-                            SkScalarInterp(pts[0].fY, endp->fY, stopT));
+                dst->lineTo(SkScalarInterp(pts[0].fX, pts[1].fX, stopT),
+                            SkScalarInterp(pts[0].fY, pts[1].fY, stopT));
             }
             break;
-        }
         case kQuad_SegType:
             if (startT == 0) {
                 if (stopT == SK_Scalar1) {
@@ -379,6 +365,7 @@
         fIter.setPath(*path, forceClosed);
     }
     fSegments.reset();
+    fPts.reset();
 }
 
 SkScalar SkPathMeasure::getLength() {
@@ -431,7 +418,6 @@
                               SkVector* tangent) {
     SkASSERT(fPath);
     if (fPath == NULL) {
-    EMPTY:
         return false;
     }
 
@@ -439,7 +425,7 @@
     int         count = fSegments.count();
 
     if (count == 0 || length == 0) {
-        goto EMPTY;
+        return false;
     }
 
     // pin the distance to a legal range
@@ -452,8 +438,7 @@
     SkScalar        t;
     const Segment*  seg = this->distanceToSegment(distance, &t);
 
-    compute_pos_tan(*fPath, fSegments[0].fPtIndex, seg->fPtIndex, seg->fType,
-                    t, pos, tangent);
+    compute_pos_tan(fPts, seg->fPtIndex, seg->fType, t, pos, tangent);
     return true;
 }
 
@@ -501,23 +486,19 @@
     SkASSERT(seg <= stopSeg);
 
     if (startWithMoveTo) {
-        compute_pos_tan(*fPath, fSegments[0].fPtIndex, seg->fPtIndex,
-                        seg->fType, startT, &p, NULL);
+        compute_pos_tan(fPts, seg->fPtIndex, seg->fType, startT, &p, NULL);
         dst->moveTo(p);
     }
 
     if (seg->fPtIndex == stopSeg->fPtIndex) {
-        seg_to(*fPath, fSegments[0].fPtIndex, seg->fPtIndex, seg->fType,
-               startT, stopT, dst);
+        seg_to(fPts, seg->fPtIndex, seg->fType, startT, stopT, dst);
     } else {
         do {
-            seg_to(*fPath, fSegments[0].fPtIndex, seg->fPtIndex, seg->fType,
-                   startT, SK_Scalar1, dst);
+            seg_to(fPts, seg->fPtIndex, seg->fType, startT, SK_Scalar1, dst);
             seg = SkPathMeasure::NextSegment(seg);
             startT = 0;
         } while (seg->fPtIndex < stopSeg->fPtIndex);
-        seg_to(*fPath, fSegments[0].fPtIndex, seg->fPtIndex, seg->fType,
-               0, stopT, dst);
+        seg_to(fPts, seg->fPtIndex, seg->fType, 0, stopT, dst);
     }
     return true;
 }
diff --git a/src/core/SkPicture.cpp b/src/core/SkPicture.cpp
index a3b7396..6aaaf2d 100644
--- a/src/core/SkPicture.cpp
+++ b/src/core/SkPicture.cpp
@@ -192,7 +192,9 @@
 #define PICTURE_VERSION     1
 
 SkPicture::SkPicture(SkStream* stream) : SkRefCnt() {
-    if (stream->readU32() != PICTURE_VERSION) {
+    const uint32_t  pictureVersion = stream->readU32();
+    if (pictureVersion != PICTURE_VERSION_ICS &&
+        pictureVersion != PICTURE_VERSION_JB) {
         sk_throw();
     }
 
@@ -203,7 +205,7 @@
     fPlayback = NULL;
 
     if (stream->readBool()) {
-        fPlayback = SkNEW_ARGS(SkPicturePlayback, (stream));
+        fPlayback = SkNEW_ARGS(SkPicturePlayback, (stream, pictureVersion));
     }
 }
 
@@ -214,7 +216,7 @@
         playback = SkNEW_ARGS(SkPicturePlayback, (*fRecord));
     }
 
-    stream->write32(PICTURE_VERSION);
+    stream->write32(PICTURE_VERSION_JB);
     stream->write32(fWidth);
     stream->write32(fHeight);
     if (playback) {
diff --git a/src/core/SkPictureFlat.h b/src/core/SkPictureFlat.h
index 983bfc5..bfd253a 100644
--- a/src/core/SkPictureFlat.h
+++ b/src/core/SkPictureFlat.h
@@ -24,7 +24,6 @@
     CONCAT,
     DRAW_BITMAP,
     DRAW_BITMAP_MATRIX,
-    DRAW_BITMAP_NINE,
     DRAW_BITMAP_RECT,
     DRAW_CLEAR,
     DRAW_DATA,
@@ -37,6 +36,7 @@
     DRAW_POS_TEXT_H,
     DRAW_POS_TEXT_H_TOP_BOTTOM, // fast variant of DRAW_POS_TEXT_H
     DRAW_RECT,
+    DRAW_SHAPE,
     DRAW_SPRITE,
     DRAW_TEXT,
     DRAW_TEXT_ON_PATH,
@@ -49,7 +49,8 @@
     SCALE,
     SET_MATRIX,
     SKEW,
-    TRANSLATE
+    TRANSLATE,
+    DRAW_BITMAP_NINE
 };
 
 enum DrawVertexFlags {
diff --git a/src/core/SkPicturePlayback.cpp b/src/core/SkPicturePlayback.cpp
index c2082c7..f5756a3 100644
--- a/src/core/SkPicturePlayback.cpp
+++ b/src/core/SkPicturePlayback.cpp
@@ -263,6 +263,7 @@
 #define PICT_PAINT_TAG      SkSetFourByteTag('p', 'n', 't', ' ')
 #define PICT_PATH_TAG       SkSetFourByteTag('p', 't', 'h', ' ')
 #define PICT_REGION_TAG     SkSetFourByteTag('r', 'g', 'n', ' ')
+#define PICT_SHAPE_TAG      SkSetFourByteTag('s', 'h', 'p', ' ')
 
 #include "SkStream.h"
 
@@ -391,7 +392,7 @@
     return stream->readU32();
 }
 
-SkPicturePlayback::SkPicturePlayback(SkStream* stream) {
+SkPicturePlayback::SkPicturePlayback(SkStream* stream, uint32_t pictureVersion) {
     this->init();
 
     int i;
@@ -401,6 +402,7 @@
         void* storage = sk_malloc_throw(size);
         stream->read(storage, size);
         fReader.setMemory(storage, size);
+        fReader.setPictureVersion(pictureVersion);
     }
 
     int factoryCount = readTagSize(stream, PICT_FACTORY_TAG);
@@ -434,6 +436,7 @@
     stream->read(storage.get(), size);
 
     SkFlattenableReadBuffer buffer(storage.get(), size);
+    buffer.setPictureVersion(pictureVersion);
     fFactoryPlayback->setupBuffer(buffer);
     fTFPlayback.setupBuffer(buffer);
 
@@ -467,6 +470,13 @@
         SkDEBUGCODE(uint32_t bytes =) fRegions[i].unflatten(buffer.skip(size));
         SkASSERT(size == bytes);
     }
+
+    if (pictureVersion == PICTURE_VERSION_ICS) {
+        int shapeCount = readTagSize(buffer, PICT_SHAPE_TAG);
+        for (i = 0; i < shapeCount; i++) {
+            buffer.readFlattenable();
+        }
+    }
 }
 
 ///////////////////////////////////////////////////////////////////////////////
diff --git a/src/core/SkPicturePlayback.h b/src/core/SkPicturePlayback.h
index 88f86e2..70bd0ce 100644
--- a/src/core/SkPicturePlayback.h
+++ b/src/core/SkPicturePlayback.h
@@ -32,7 +32,7 @@
     SkPicturePlayback();
     SkPicturePlayback(const SkPicturePlayback& src);
     explicit SkPicturePlayback(const SkPictureRecord& record);
-    explicit SkPicturePlayback(SkStream*);
+    explicit SkPicturePlayback(SkStream*, uint32_t pictureVersion = PICTURE_VERSION_JB);
 
     virtual ~SkPicturePlayback();
 
@@ -41,7 +41,7 @@
     void serialize(SkWStream*) const;
 
     void dumpSize() const;
-    
+
     // Can be called in the middle of playback (the draw() call). WIll abort the
     // drawing and return from draw() after the "current" op code is done
     void abort();
@@ -83,7 +83,7 @@
         SkASSERT(index > 0 && index <= fPictureCount);
         return *fPictureRefs[index - 1];
     }
-    
+
     const SkPaint* getPaint() {
         int index = getInt();
         if (index == 0) {
diff --git a/src/core/SkPictureRecord.cpp b/src/core/SkPictureRecord.cpp
index fd403cf..ba49b72 100644
--- a/src/core/SkPictureRecord.cpp
+++ b/src/core/SkPictureRecord.cpp
@@ -23,6 +23,7 @@
     fRestoreOffsetStack.push(0);
 
     fPathHeap = NULL;   // lazy allocate
+    fFirstSavedLayerIndex = kNoSavedLayerIndex;
 }
 
 SkPictureRecord::~SkPictureRecord() {
@@ -50,6 +51,10 @@
 
     fRestoreOffsetStack.push(0);
 
+    if (kNoSavedLayerIndex == fFirstSavedLayerIndex) {
+        fFirstSavedLayerIndex = fRestoreOffsetStack.count();
+    }
+
     validate();
     /*  Don't actually call saveLayer, because that will try to allocate an
         offscreen device (potentially very big) which we don't actually need
@@ -57,7 +62,13 @@
         clip starts out the size of the picture, which is often much larger
         than the size of the actual device we'll use during playback).
      */
-    return this->INHERITED::save(flags);
+    int count = this->INHERITED::save(flags);
+    this->clipRectBounds(bounds, flags, NULL);
+    return count;
+}
+
+bool SkPictureRecord::isDrawingToLayer() const {
+    return fFirstSavedLayerIndex != kNoSavedLayerIndex;
 }
 
 void SkPictureRecord::restore() {
@@ -74,6 +85,11 @@
         offset = *peek;
         *peek = restoreOffset;
     }
+
+    if (fRestoreOffsetStack.count() == fFirstSavedLayerIndex) {
+        fFirstSavedLayerIndex = kNoSavedLayerIndex;
+    }
+
     fRestoreOffsetStack.pop();
 
     addDraw(RESTORE);
@@ -157,7 +173,7 @@
             *peek = 0;
         }
     }
-    
+
     size_t offset = fWriter.size();
     addInt(fRestoreOffsetStack.top());
     fRestoreOffsetStack.top() = offset;
diff --git a/src/core/SkPictureRecord.h b/src/core/SkPictureRecord.h
index 3c85b9b..dfddffe 100644
--- a/src/core/SkPictureRecord.h
+++ b/src/core/SkPictureRecord.h
@@ -49,14 +49,14 @@
                                 const SkRect& dst, const SkPaint*) SK_OVERRIDE;
     virtual void drawSprite(const SkBitmap&, int left, int top,
                             const SkPaint*) SK_OVERRIDE;
-    virtual void drawText(const void* text, size_t byteLength, SkScalar x, 
+    virtual void drawText(const void* text, size_t byteLength, SkScalar x,
                           SkScalar y, const SkPaint&) SK_OVERRIDE;
-    virtual void drawPosText(const void* text, size_t byteLength, 
+    virtual void drawPosText(const void* text, size_t byteLength,
                              const SkPoint pos[], const SkPaint&) SK_OVERRIDE;
     virtual void drawPosTextH(const void* text, size_t byteLength,
                       const SkScalar xpos[], SkScalar constY, const SkPaint&) SK_OVERRIDE;
-    virtual void drawTextOnPath(const void* text, size_t byteLength, 
-                            const SkPath& path, const SkMatrix* matrix, 
+    virtual void drawTextOnPath(const void* text, size_t byteLength,
+                            const SkPath& path, const SkMatrix* matrix,
                                 const SkPaint&) SK_OVERRIDE;
     virtual void drawPicture(SkPicture& picture) SK_OVERRIDE;
     virtual void drawVertices(VertexMode, int vertexCount,
@@ -65,6 +65,7 @@
                           const uint16_t indices[], int indexCount,
                               const SkPaint&) SK_OVERRIDE;
     virtual void drawData(const void*, size_t) SK_OVERRIDE;
+    virtual bool isDrawingToLayer() const SK_OVERRIDE;
 
     void addFontMetricsTopBottom(const SkPaint& paint, SkScalar minY, SkScalar maxY);
 
@@ -83,7 +84,7 @@
     const SkTDArray<const SkFlatRegion* >& getRegions() const {
         return fRegions;
     }
-    
+
     void reset();
 
     const SkWriter32& writeStream() const {
@@ -92,20 +93,24 @@
 
 private:
     SkTDArray<uint32_t> fRestoreOffsetStack;
+    int fFirstSavedLayerIndex;
+    enum {
+        kNoSavedLayerIndex = -1
+    };
 
     void addDraw(DrawType drawType) {
 #ifdef SK_DEBUG_TRACE
         SkDebugf("add %s\n", DrawTypeToString(drawType));
 #endif
         fWriter.writeInt(drawType);
-    }    
+    }
     void addInt(int value) {
         fWriter.writeInt(value);
     }
     void addScalar(SkScalar scalar) {
         fWriter.writeScalar(scalar);
     }
-    
+
     void addBitmap(const SkBitmap& bitmap);
     void addMatrix(const SkMatrix& matrix);
     void addMatrixPtr(const SkMatrix* matrix);
@@ -137,14 +142,14 @@
 
 #ifdef SK_DEBUG_SIZE
 public:
-    size_t size() const;    
+    size_t size() const;
     int bitmaps(size_t* size) const;
     int matrices(size_t* size) const;
     int paints(size_t* size) const;
     int paths(size_t* size) const;
     int regions(size_t* size) const;
     size_t streamlen() const;
-    
+
     size_t fPointBytes, fRectBytes, fTextBytes;
     int fPointWrites, fRectWrites, fTextWrites;
 #endif
@@ -181,13 +186,14 @@
 
     SkRefCntSet fRCSet;
     SkRefCntSet fTFSet;
-    
+
     uint32_t fRecordFlags;
 
     // helper function to handle save/restore culling offsets
     void recordOffsetForRestore(SkRegion::Op op);
 
     friend class SkPicturePlayback;
+    friend class SkPictureTester; // for unit testing
 
     typedef SkCanvas INHERITED;
 };
diff --git a/src/core/SkPixelRef.cpp b/src/core/SkPixelRef.cpp
index 1f28ae7..d5e1b81 100644
--- a/src/core/SkPixelRef.cpp
+++ b/src/core/SkPixelRef.cpp
@@ -9,7 +9,7 @@
 #include "SkFlattenable.h"
 #include "SkThread.h"
 
-static SkMutex  gPixelRefMutex;
+SK_DECLARE_STATIC_MUTEX(gPixelRefMutex);
 
 extern int32_t SkNextPixelRefGenerationID();
 int32_t SkNextPixelRefGenerationID() {
@@ -24,7 +24,7 @@
 }
 
 
-SkPixelRef::SkPixelRef(SkMutex* mutex) {
+SkPixelRef::SkPixelRef(SkBaseMutex* mutex) {
     if (NULL == mutex) {
         mutex = &gPixelRefMutex;
     }
@@ -36,7 +36,7 @@
     fIsImmutable = false;
 }
 
-SkPixelRef::SkPixelRef(SkFlattenableReadBuffer& buffer, SkMutex* mutex) {
+SkPixelRef::SkPixelRef(SkFlattenableReadBuffer& buffer, SkBaseMutex* mutex) {
     if (NULL == mutex) {
         mutex = &gPixelRefMutex;
     }
diff --git a/src/core/SkRasterClip.cpp b/src/core/SkRasterClip.cpp
index 9bf39fa..5a2447d 100644
--- a/src/core/SkRasterClip.cpp
+++ b/src/core/SkRasterClip.cpp
@@ -70,6 +70,8 @@
     if (this->isBW() && !doAA) {
         return fBW.setPath(path, clip);
     } else {
+        // TODO: since we are going to over-write fAA completely (aren't we?)
+        // we should just clear our BW data (if any) and set fIsAA=true
         if (this->isBW()) {
             this->convertToAA();
         }
diff --git a/src/core/SkRegion.cpp b/src/core/SkRegion.cpp
index 0a8ab65..dee652b 100644
--- a/src/core/SkRegion.cpp
+++ b/src/core/SkRegion.cpp
@@ -455,18 +455,18 @@
 
 /////////////////////////////////////////////////////////////////////////////////////
 
-bool operator==(const SkRegion& a, const SkRegion& b) {
-    SkDEBUGCODE(a.validate();)
+bool SkRegion::operator==(const SkRegion& b) const {
+    SkDEBUGCODE(validate();)
     SkDEBUGCODE(b.validate();)
 
-    if (&a == &b) {
+    if (this == &b) {
         return true;
     }
-    if (a.fBounds != b.fBounds) {
+    if (fBounds != b.fBounds) {
         return false;
     }
     
-    const SkRegion::RunHead* ah = a.fRunHead;
+    const SkRegion::RunHead* ah = fRunHead;
     const SkRegion::RunHead* bh = b.fRunHead;
 
     // this catches empties and rects being equal
diff --git a/src/core/SkScan_AntiPath.cpp b/src/core/SkScan_AntiPath.cpp
index 97843ef..e80ad3e 100644
--- a/src/core/SkScan_AntiPath.cpp
+++ b/src/core/SkScan_AntiPath.cpp
@@ -33,10 +33,16 @@
     NEW_AA is a set of code-changes to try to make both paths produce identical
     results. Its not quite there yet, though the remaining differences may be
     in the subsequent blits, and not in the different masks/runs...
+
+    SK_USE_EXACT_COVERAGE makes coverage_to_partial_alpha() behave similarly to
+    coverage_to_exact_alpha(). Enabling it will require rebaselining about 1/3
+    of GMs for changes in the 3 least significant bits along the edges of
+    antialiased spans.
  */
 //#define FORCE_SUPERMASK
 //#define FORCE_RLE
 //#define SK_SUPPORT_NEW_AA
+//#define SK_USE_EXACT_COVERAGE
 
 ///////////////////////////////////////////////////////////////////////////////
 
@@ -78,11 +84,13 @@
                                    const SkRegion& clip) {
     fRealBlitter = realBlitter;
 
-    // take the union of the ir bounds and clip, since we may be called with an
-    // inverse filltype
-    const int left = SkMin32(ir.fLeft, clip.getBounds().fLeft);
-    const int right = SkMax32(ir.fRight, clip.getBounds().fRight);
-
+    /*
+     *  We use the clip bounds instead of the ir, since we may be asked to
+     *  draw outside of the rect if we're an inverse filltype
+     */
+    const int left = clip.getBounds().fLeft;
+    const int right = clip.getBounds().fRight;
+    
     fLeft = left;
     fSuperLeft = left << SHIFT;
     fWidth = right - left;
@@ -150,12 +158,25 @@
     }
 }
 
-static inline int coverage_to_alpha(int aa) {
+/** coverage_to_partial_alpha() is being used by SkAlphaRuns, which
+    *accumulates* SCALE pixels worth of "alpha" in [0,(256/SCALE)]
+    to produce a final value in [0, 255] and handles clamping 256->255
+    itself, with the same (alpha - (alpha >> 8)) correction as
+    coverage_to_exact_alpha().
+*/
+static inline int coverage_to_partial_alpha(int aa) {
+#ifdef SK_USE_EXACT_COVERAGE
+    return aa << (8 - 2 * SHIFT);
+#else
     aa <<= 8 - 2*SHIFT;
     aa -= aa >> (8 - SHIFT - 1);
     return aa;
+#endif
 }
 
+/** coverage_to_exact_alpha() is being used by our blitter, which wants
+    a final value in [0, 255].
+*/
 static inline int coverage_to_exact_alpha(int aa) {
     int alpha = (256 >> SHIFT) * aa;
     // clamp 256->255
@@ -210,9 +231,8 @@
         }
     }
 
-    // TODO - should this be using coverage_to_exact_alpha?
-    fOffsetX = fRuns.add(x >> SHIFT, coverage_to_alpha(fb),
-                         n, coverage_to_alpha(fe),
+    fOffsetX = fRuns.add(x >> SHIFT, coverage_to_partial_alpha(fb),
+                         n, coverage_to_partial_alpha(fe),
                          (1 << (8 - SHIFT)) - (((y & MASK) + 1) >> SHIFT),
                          fOffsetX);
 
@@ -528,7 +548,7 @@
     if (n < 0) {
         SkASSERT(row >= fMask.fImage);
         SkASSERT(row < fMask.fImage + kMAX_STORAGE + 1);
-        add_aa_span(row, coverage_to_alpha(fe - fb));
+        add_aa_span(row, coverage_to_partial_alpha(fe - fb));
     } else {
 #ifdef SK_SUPPORT_NEW_AA
         if (0 == fb) {
@@ -541,7 +561,8 @@
 #endif
         SkASSERT(row >= fMask.fImage);
         SkASSERT(row + n + 1 < fMask.fImage + kMAX_STORAGE + 1);
-        add_aa_span(row,  coverage_to_alpha(fb), n, coverage_to_alpha(fe),
+        add_aa_span(row,  coverage_to_partial_alpha(fb),
+                    n, coverage_to_partial_alpha(fe),
                     (1 << (8 - SHIFT)) - (((y & MASK) + 1) >> SHIFT));
     }
 
@@ -552,47 +573,74 @@
 
 ///////////////////////////////////////////////////////////////////////////////
 
-/*  Returns non-zero if (value << shift) overflows a short, which would mean
-    we could not shift it up and then convert to SkFixed.
-    i.e. is x expressible as signed (16-shift) bits?
- */
-static int overflows_short_shift(int value, int shift) {
-    const int s = 16 + shift;
-    return (value << s >> s) - value;
+static bool fitsInsideLimit(const SkRect& r, SkScalar max) {
+    const SkScalar min = -max;
+    return  r.fLeft > min && r.fTop > min &&
+            r.fRight < max && r.fBottom < max;
 }
 
-void SkScan::AntiFillPath(const SkPath& path, const SkRegion& clip,
+static bool safeRoundOut(const SkRect& src, SkIRect* dst, int32_t maxInt) {
+#ifdef SK_SCALAR_IS_FIXED
+    // the max-int (shifted) is exactly what we want to compare against, to know
+    // if we can survive shifting our fixed-point coordinates
+    const SkFixed maxScalar = maxInt;
+#else
+    const SkScalar maxScalar = SkIntToScalar(maxInt);
+#endif
+    if (fitsInsideLimit(src, maxScalar)) {
+        src.roundOut(dst);
+        return true;
+    }
+    return false;
+}
+
+void SkScan::AntiFillPath(const SkPath& path, const SkRegion& origClip,
                           SkBlitter* blitter, bool forceRLE) {
-    if (clip.isEmpty()) {
+    if (origClip.isEmpty()) {
         return;
     }
 
     SkIRect ir;
-    path.getBounds().roundOut(&ir);
+
+    if (!safeRoundOut(path.getBounds(), &ir, SK_MaxS32 >> SHIFT)) {
+#if 0
+        const SkRect& r = path.getBounds();
+        SkDebugf("--- bounds can't fit in SkIRect\n", r.fLeft, r.fTop, r.fRight, r.fBottom);
+#endif
+        return;
+    }
     if (ir.isEmpty()) {
         if (path.isInverseFillType()) {
-            blitter->blitRegion(clip);
+            blitter->blitRegion(origClip);
         }
         return;
     }
 
-    // use bit-or since we expect all to pass, so no need to go slower with
-    // a short-circuiting logical-or
-    if (overflows_short_shift(ir.fLeft, SHIFT) |
-            overflows_short_shift(ir.fRight, SHIFT) |
-            overflows_short_shift(ir.fTop, SHIFT) |
-            overflows_short_shift(ir.fBottom, SHIFT)) {
-        // can't supersample, so draw w/o antialiasing
-        SkScan::FillPath(path, clip, blitter);
-        return;
+    // Our antialiasing can't handle a clip larger than 32767, so we restrict
+    // the clip to that limit here. (the runs[] uses int16_t for its index).
+    //
+    // A more general solution (one that could also eliminate the need to disable
+    // aa based on ir bounds (see overflows_short_shift)) would be to tile the
+    // clip/target...
+    SkRegion tmpClipStorage;
+    const SkRegion* clipRgn = &origClip;
+    {
+        static const int32_t kMaxClipCoord = 32767;
+        const SkIRect& bounds = origClip.getBounds();
+        if (bounds.fRight > kMaxClipCoord || bounds.fBottom > kMaxClipCoord) {
+            SkIRect limit = { 0, 0, kMaxClipCoord, kMaxClipCoord };
+            tmpClipStorage.op(origClip, limit, SkRegion::kIntersect_Op);
+            clipRgn = &tmpClipStorage;
+        }
     }
+    // from here down, use clipRgn, not origClip
 
-    SkScanClipper   clipper(blitter, &clip, ir);
+    SkScanClipper   clipper(blitter, clipRgn, ir);
     const SkIRect*  clipRect = clipper.getClipRect();
 
     if (clipper.getBlitter() == NULL) { // clipped out
         if (path.isInverseFillType()) {
-            blitter->blitRegion(clip);
+            blitter->blitRegion(*clipRgn);
         }
         return;
     }
@@ -601,7 +649,7 @@
     blitter = clipper.getBlitter();
 
     if (path.isInverseFillType()) {
-        sk_blit_above(blitter, ir, clip);
+        sk_blit_above(blitter, ir, *clipRgn);
     }
 
     SkIRect superRect, *superClipRect = NULL;
@@ -617,16 +665,16 @@
     // MaskSuperBlitter can't handle drawing outside of ir, so we can't use it
     // if we're an inverse filltype
     if (!path.isInverseFillType() && MaskSuperBlitter::CanHandleRect(ir) && !forceRLE) {
-        MaskSuperBlitter    superBlit(blitter, ir, clip);
+        MaskSuperBlitter    superBlit(blitter, ir, *clipRgn);
         SkASSERT(SkIntToScalar(ir.fTop) <= path.getBounds().fTop);
-        sk_fill_path(path, superClipRect, &superBlit, ir.fTop, ir.fBottom, SHIFT, clip);
+        sk_fill_path(path, superClipRect, &superBlit, ir.fTop, ir.fBottom, SHIFT, *clipRgn);
     } else {
-        SuperBlitter    superBlit(blitter, ir, clip);
-        sk_fill_path(path, superClipRect, &superBlit, ir.fTop, ir.fBottom, SHIFT, clip);
+        SuperBlitter    superBlit(blitter, ir, *clipRgn);
+        sk_fill_path(path, superClipRect, &superBlit, ir.fTop, ir.fBottom, SHIFT, *clipRgn);
     }
 
     if (path.isInverseFillType()) {
-        sk_blit_below(blitter, ir, clip);
+        sk_blit_below(blitter, ir, *clipRgn);
     }
 }
 
diff --git a/src/core/SkScan_Path.cpp b/src/core/SkScan_Path.cpp
index 7a8aa75..5b92ff9 100644
--- a/src/core/SkScan_Path.cpp
+++ b/src/core/SkScan_Path.cpp
@@ -131,7 +131,7 @@
         while (currE->fFirstY <= curr_y) {
             SkASSERT(currE->fLastY >= curr_y);
 
-            int x = (currE->fX + SK_Fixed1/2) >> 16;
+            int x = SkFixedRoundToInt(currE->fX);
             w += currE->fWinding;
             if ((w & windingMask) == 0) { // we finished an interval
                 SkASSERT(in_interval);
@@ -256,8 +256,8 @@
         int count = local_bot - local_top;
         SkASSERT(count >= 0);
         if (0 == (dLeft | dRite)) {
-            int L = (left + SK_Fixed1/2) >> 16;
-            int R = (rite + SK_Fixed1/2) >> 16;
+            int L = SkFixedRoundToInt(left);
+            int R = SkFixedRoundToInt(rite);
             if (L < R) {
                 count += 1;
                 blitter->blitRect(L, local_top, R - L, count);
@@ -267,8 +267,8 @@
             local_top = local_bot + 1;
         } else {
             do {
-                int L = (left + SK_Fixed1/2) >> 16;
-                int R = (rite + SK_Fixed1/2) >> 16;
+                int L = SkFixedRoundToInt(left);
+                int R = SkFixedRoundToInt(rite);
                 if (L < R) {
                     blitter->blitH(L, local_top, R - L);
                 }
diff --git a/src/core/SkStroke.cpp b/src/core/SkStroke.cpp
index 8816de5..4b486b1 100644
--- a/src/core/SkStroke.cpp
+++ b/src/core/SkStroke.cpp
@@ -472,11 +472,11 @@
 ///////////////////////////////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////
 
-#include "SkPaint.h"
+#include "SkPaintDefaults.h"
 
 SkStroke::SkStroke() {
-    fWidth      = SK_DefaultStrokeWidth;
-    fMiterLimit = SK_DefaultMiterLimit;
+    fWidth      = SK_Scalar1;
+    fMiterLimit = SkPaintDefaults_MiterLimit;
     fCap        = SkPaint::kDefault_Cap;
     fJoin       = SkPaint::kDefault_Join;
     fDoFill     = false;
diff --git a/src/core/SkTextFormatParams.h b/src/core/SkTextFormatParams.h
index c2767f9..dac4aef 100644
--- a/src/core/SkTextFormatParams.h
+++ b/src/core/SkTextFormatParams.h
@@ -25,8 +25,8 @@
 // At 36 points and above, it is increased by text size / 32.  In between,
 // it is interpolated between those values.
 static const SkScalar kStdFakeBoldInterpKeys[] = {
-    SkIntToScalar(9),
-    SkIntToScalar(36)
+    SK_Scalar1*9,
+    SK_Scalar1*36,
 };
 static const SkScalar kStdFakeBoldInterpValues[] = {
     SK_Scalar1/24,
diff --git a/src/core/SkTypefaceCache.cpp b/src/core/SkTypefaceCache.cpp
index 5a0b36c..f4397a6 100644
--- a/src/core/SkTypefaceCache.cpp
+++ b/src/core/SkTypefaceCache.cpp
@@ -82,7 +82,7 @@
     return sk_atomic_inc(&gFontID) + 1;
 }
 
-static SkMutex gMutex;
+SK_DECLARE_STATIC_MUTEX(gMutex);
 
 void SkTypefaceCache::Add(SkTypeface* face, SkTypeface::Style requestedStyle) {
     SkAutoMutexAcquire ama(gMutex);
@@ -94,12 +94,15 @@
     return Get().findByID(fontID);
 }
 
-SkTypeface* SkTypefaceCache::FindByProc(FindProc proc, void* ctx) {
+SkTypeface* SkTypefaceCache::FindByProcAndRef(FindProc proc, void* ctx) {
     SkAutoMutexAcquire ama(gMutex);
-    return Get().findByProc(proc, ctx);
+    SkTypeface* typeface = Get().findByProc(proc, ctx);
+    SkSafeRef(typeface);
+    return typeface;
 }
 
 void SkTypefaceCache::PurgeAll() {
+    SkAutoMutexAcquire ama(gMutex);
     Get().purgeAll();
 }
 
diff --git a/src/core/SkTypefaceCache.h b/src/core/SkTypefaceCache.h
index b788d21..e65ec90 100644
--- a/src/core/SkTypefaceCache.h
+++ b/src/core/SkTypefaceCache.h
@@ -23,6 +23,11 @@
 
 class SkTypefaceCache {
 public:
+    /**
+     * Callback for FindByProcAndRef. Returns true if the given typeface is a match
+     * for the given context. The passed typeface is owned by the cache and is
+     * not additionally ref()ed.
+     */
     typedef bool (*FindProc)(SkTypeface*, SkTypeface::Style, void* context);
 
     /**
@@ -33,29 +38,32 @@
 
     /**
      *  Add a typeface to the cache. This ref()s the typeface, so that the
-     *  cache is also an owner. Later, if we need to purge the cache, it will
-     *  unref() typefaces whose refcnt is 1 (meaning only the cache is an owner).
+     *  cache is also an owner. Later, if we need to purge the cache, typefaces
+     *  whose refcnt is 1 (meaning only the cache is an owner) will be
+     *  unref()ed.
      */
     static void Add(SkTypeface*, SkTypeface::Style requested);
 
     /**
      *  Search the cache for a typeface with the specified fontID (uniqueID).
      *  If one is found, return it (its reference count is unmodified). If none
-     *  is found, return NULL.
+     *  is found, return NULL. The reference count is unmodified as it is
+     *  assumed that the stack will contain a ref to the typeface.
      */
     static SkTypeface* FindByID(SkFontID fontID);
 
     /**
      *  Iterate through the cache, calling proc(typeface, ctx) with each
-     *  typeface. If proc returns true, then we return that typeface (its
-     *  reference count is unmodified). If it never returns true, we return NULL.
+     *  typeface. If proc returns true, then we return that typeface (this
+     *  ref()s the typeface). If it never returns true, we return NULL.
      */
-    static SkTypeface* FindByProc(FindProc proc, void* ctx);
+    static SkTypeface* FindByProcAndRef(FindProc proc, void* ctx);
 
     /**
-     *  This will unref all of the typefaces in the cache. Normally this is
-     *  handled automatically as needed. This function is exposed for clients
-     *  that explicitly want to purge the entire cache (e.g. to look for leaks).
+     *  This will unref all of the typefaces in the cache for which the cache
+     *  is the only owner. Normally this is handled automatically as needed.
+     *  This function is exposed for clients that explicitly want to purge the
+     *  cache (e.g. to look for leaks).
      */
     static void PurgeAll();
 
diff --git a/src/core/SkXfermode.cpp b/src/core/SkXfermode.cpp
index fdf1798..efedda4 100644
--- a/src/core/SkXfermode.cpp
+++ b/src/core/SkXfermode.cpp
@@ -12,17 +12,6 @@
 
 #define SkAlphaMulAlpha(a, b)   SkMulDiv255Round(a, b)
 
-static SkPMColor SkFourByteInterp(SkPMColor src, SkPMColor dst, U8CPU alpha) {
-    unsigned scale = SkAlpha255To256(alpha);
-
-    unsigned a = SkAlphaBlend(SkGetPackedA32(src), SkGetPackedA32(dst), scale);
-    unsigned r = SkAlphaBlend(SkGetPackedR32(src), SkGetPackedR32(dst), scale);
-    unsigned g = SkAlphaBlend(SkGetPackedG32(src), SkGetPackedG32(dst), scale);
-    unsigned b = SkAlphaBlend(SkGetPackedB32(src), SkGetPackedB32(dst), scale);
-
-    return SkPackARGB32(a, r, g, b);
-}
-
 #if 0
 // idea for higher precision blends in xfer procs (and slightly faster)
 // see DstATop as a probable caller
@@ -755,6 +744,12 @@
             : INHERITED(buffer) {
         fMode = (SkXfermode::Mode)buffer.readU32();
 
+        if (buffer.getPictureVersion() == PICTURE_VERSION_ICS) {
+            fSrcCoeff = (Coeff)buffer.readU32();
+            fDstCoeff = (Coeff)buffer.readU32();
+            return;
+        }
+
         const ProcCoeff& rec = gProcCoeffs[fMode];
         // these may be valid, or may be CANNOT_USE_COEFF
         fSrcCoeff = rec.fSC;
diff --git a/src/effects/SkAvoidXfermode.cpp b/src/effects/SkAvoidXfermode.cpp
index d2cd49b..9f4e396 100644
--- a/src/effects/SkAvoidXfermode.cpp
+++ b/src/effects/SkAvoidXfermode.cpp
@@ -98,16 +98,6 @@
     return result;
 }
 
-static SkPMColor SkFourByteInterp(SkPMColor src, SkPMColor dst, unsigned scale)
-{
-    unsigned a = SkAlphaBlend(SkGetPackedA32(src), SkGetPackedA32(dst), scale);
-    unsigned r = SkAlphaBlend(SkGetPackedR32(src), SkGetPackedR32(dst), scale);
-    unsigned g = SkAlphaBlend(SkGetPackedG32(src), SkGetPackedG32(dst), scale);
-    unsigned b = SkAlphaBlend(SkGetPackedB32(src), SkGetPackedB32(dst), scale);
-
-    return SkPackARGB32(a, r, g, b);
-}
-
 static inline unsigned Accurate255To256(unsigned x) {
     return x + (x >> 7);
 }
diff --git a/src/effects/SkGradientShader.cpp b/src/effects/SkGradientShader.cpp
index a907d04..4bbc96c 100644
--- a/src/effects/SkGradientShader.cpp
+++ b/src/effects/SkGradientShader.cpp
@@ -16,10 +16,6 @@
 #include "SkTemplates.h"
 #include "SkBitmapCache.h"
 
-#if defined(SK_SCALAR_IS_FLOAT) && !defined(SK_DONT_USE_FLOAT_SQRT)
-    #define SK_USE_FLOAT_SQRT
-#endif
-
 #ifndef SK_DISABLE_DITHER_32BIT_GRADIENT
     #define USE_DITHER_32BIT_GRADIENT
 #endif
@@ -53,35 +49,39 @@
     return retval;
 }
 
-///////////////////////////////////////////////////////////////////////////////
-
-typedef SkFixed (*TileProc)(SkFixed);
+//  Clamp
 
 static SkFixed clamp_tileproc(SkFixed x) {
     return SkClampMax(x, 0xFFFF);
 }
 
+// Repeat
+
 static SkFixed repeat_tileproc(SkFixed x) {
     return x & 0xFFFF;
 }
 
+static inline int repeat_bits(int x, const int bits) {
+    return x & ((1 << bits) - 1);
+}
+
+static inline int repeat_8bits(int x) {
+    return x & 0xFF;
+}
+
+// Mirror
+
+// Visual Studio 2010 (_MSC_VER == 1600) optimizes bit-shift code incorrectly.
+// See http://code.google.com/p/skia/issues/detail?id=472
+#if defined(_MSC_VER) && (_MSC_VER >= 1600)
+#pragma optimize("", off)
+#endif
+
 static inline SkFixed mirror_tileproc(SkFixed x) {
     int s = x << 15 >> 31;
     return (x ^ s) & 0xFFFF;
 }
 
-static const TileProc gTileProcs[] = {
-    clamp_tileproc,
-    repeat_tileproc,
-    mirror_tileproc
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-static inline int repeat_bits(int x, const int bits) {
-    return x & ((1 << bits) - 1);
-}
-
 static inline int mirror_bits(int x, const int bits) {
 #ifdef SK_CPU_HAS_CONDITIONAL_INSTR
     if (x & (1 << bits))
@@ -93,10 +93,6 @@
 #endif
 }
 
-static inline int repeat_8bits(int x) {
-    return x & 0xFF;
-}
-
 static inline int mirror_8bits(int x) {
 #ifdef SK_CPU_HAS_CONDITIONAL_INSTR
     if (x & 256) {
@@ -109,6 +105,21 @@
 #endif
 }
 
+#if defined(_MSC_VER) && (_MSC_VER >= 1600)
+#pragma optimize("", on)
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+
+typedef SkFixed (*TileProc)(SkFixed);
+
+static const TileProc gTileProcs[] = {
+    clamp_tileproc,
+    repeat_tileproc,
+    mirror_tileproc
+};
+
+///////////////////////////////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////
 
 class Gradient_Shader : public SkShader {
@@ -123,23 +134,41 @@
     virtual bool isOpaque() const SK_OVERRIDE;
 
     enum {
-        kCache16Bits    = 8,    // seems like enough for visual accuracy
-        kCache16Count   = 1 << kCache16Bits,
-        kCache16Mask    = kCache16Count - 1,
+        /// Seems like enough for visual accuracy. TODO: if pos[] deserves
+        /// it, use a larger cache.
+        kCache16Bits    = 8,
+        kGradient16Length = (1 << kCache16Bits),
+        /// Each cache gets 1 extra entry at the end so we don't have to
+        /// test for end-of-cache in lerps. This is also the value used
+        /// to stride *writes* into the dither cache; it must not be zero.
+        /// Total space for a cache is 2x kCache16Count entries: one
+        /// regular cache, one for dithering.
+        kCache16Count   = kGradient16Length + 1,
         kCache16Shift   = 16 - kCache16Bits,
         kSqrt16Shift    = 8 - kCache16Bits,
 
-        kCache32Bits    = 8,    // pretty much should always be 8
-        kCache32Count   = 1 << kCache32Bits,
-        kCache32Mask    = kCache32Count - 1,
+        /// Seems like enough for visual accuracy. TODO: if pos[] deserves
+        /// it, use a larger cache.
+        kCache32Bits    = 8,
+        kGradient32Length = (1 << kCache32Bits),
+        /// Each cache gets 1 extra entry at the end so we don't have to
+        /// test for end-of-cache in lerps. This is also the value used
+        /// to stride *writes* into the dither cache; it must not be zero.
+        /// Total space for a cache is 2x kCache32Count entries: one
+        /// regular cache, one for dithering.
+        kCache32Count   = kGradient32Length + 1,
         kCache32Shift   = 16 - kCache32Bits,
         kSqrt32Shift    = 8 - kCache32Bits,
+
+        /// This value is used to *read* the dither cache; it may be 0
+        /// if dithering is disabled.
 #ifdef USE_DITHER_32BIT_GRADIENT
-        kToggleMask32 = kCache32Count,
+        kDitherStride32 = kCache32Count,
 #else
-        kToggleMask32 = 0,
+        kDitherStride32 = 0,
 #endif
-        kToggleMask16 = kCache16Count
+        kDitherStride16 = kCache16Count,
+        kLerpRemainderMask32 = (1 << (16 - kCache32Bits)) - 1
     };
 
 
@@ -556,10 +585,11 @@
 
     do {
         cache[0] = SkPremultiplyARGBInline(a >> 16, r >> 16, g >> 16, b >> 16);
-        cache[kCache32Count] = SkPremultiplyARGBInline(dither_ceil_fixed_to_8(a),
-                                                       dither_fixed_to_8(r),
-                                                       dither_fixed_to_8(g),
-                                                       dither_fixed_to_8(b));
+        cache[kCache32Count] =
+            SkPremultiplyARGBInline(dither_ceil_fixed_to_8(a),
+                                    dither_fixed_to_8(r),
+                                    dither_fixed_to_8(g),
+                                    dither_fixed_to_8(b));
         cache += 1;
         a += da;
         r += dr;
@@ -585,6 +615,14 @@
     return 0;
 }
 
+/** We duplicate the last value in each half of the cache so that
+    interpolation doesn't have to special-case being at the last point.
+*/
+static void complete_16bit_cache(uint16_t* cache, int stride) {
+    cache[stride - 1] = cache[stride - 2];
+    cache[2 * stride - 1] = cache[2 * stride - 2];
+}
+
 const uint16_t* Gradient_Shader::getCache16() const {
     if (fCache16 == NULL) {
         // double the count for dither entries
@@ -596,7 +634,8 @@
         }
         fCache16 = fCache16Storage;
         if (fColorCount == 2) {
-            Build16bitCache(fCache16, fOrigColors[0], fOrigColors[1], kCache16Count);
+            Build16bitCache(fCache16, fOrigColors[0], fOrigColors[1],
+                            kGradient16Length);
         } else {
             Rec* rec = fRecs;
             int prevIndex = 0;
@@ -608,7 +647,8 @@
                     Build16bitCache(fCache16 + prevIndex, fOrigColors[i-1], fOrigColors[i], nextIndex - prevIndex + 1);
                 prevIndex = nextIndex;
             }
-            SkASSERT(prevIndex == kCache16Count - 1);
+            // one extra space left over at the end for complete_16bit_cache()
+            SkASSERT(prevIndex == kGradient16Length - 1);
         }
 
         if (fMapper) {
@@ -616,7 +656,7 @@
             uint16_t* linear = fCache16;         // just computed linear data
             uint16_t* mapped = fCache16Storage;  // storage for mapped data
             SkUnitMapper* map = fMapper;
-            for (int i = 0; i < kCache16Count; i++) {
+            for (int i = 0; i < kGradient16Length; i++) {
                 int index = map->mapUnit16(bitsTo16(i, kCache16Bits)) >> kCache16Shift;
                 mapped[i] = linear[index];
                 mapped[i + kCache16Count] = linear[index + kCache16Count];
@@ -624,10 +664,19 @@
             sk_free(fCache16);
             fCache16 = fCache16Storage;
         }
+        complete_16bit_cache(fCache16, kCache16Count);
     }
     return fCache16;
 }
 
+/** We duplicate the last value in each half of the cache so that
+    interpolation doesn't have to special-case being at the last point.
+*/
+static void complete_32bit_cache(SkPMColor* cache, int stride) {
+    cache[stride - 1] = cache[stride - 2];
+    cache[2 * stride - 1] = cache[2 * stride - 2];
+}
+
 const SkPMColor* Gradient_Shader::getCache32() const {
     if (fCache32 == NULL) {
         // double the count for dither entries
@@ -641,13 +690,13 @@
         fCache32 = (SkPMColor*)fCache32PixelRef->getAddr();
         if (fColorCount == 2) {
             Build32bitCache(fCache32, fOrigColors[0], fOrigColors[1],
-                            kCache32Count, fCacheAlpha);
+                            kGradient32Length, fCacheAlpha);
         } else {
             Rec* rec = fRecs;
             int prevIndex = 0;
             for (int i = 1; i < fColorCount; i++) {
                 int nextIndex = SkFixedToFFFF(rec[i].fPos) >> kCache32Shift;
-                SkASSERT(nextIndex < kCache32Count);
+                SkASSERT(nextIndex < kGradient32Length);
 
                 if (nextIndex > prevIndex)
                     Build32bitCache(fCache32 + prevIndex, fOrigColors[i-1],
@@ -655,7 +704,7 @@
                                     nextIndex - prevIndex + 1, fCacheAlpha);
                 prevIndex = nextIndex;
             }
-            SkASSERT(prevIndex == kCache32Count - 1);
+            SkASSERT(prevIndex == kGradient32Length - 1);
         }
 
         if (fMapper) {
@@ -664,7 +713,7 @@
             SkPMColor* linear = fCache32;           // just computed linear data
             SkPMColor* mapped = (SkPMColor*)newPR->getAddr();    // storage for mapped data
             SkUnitMapper* map = fMapper;
-            for (int i = 0; i < kCache32Count; i++) {
+            for (int i = 0; i < kGradient32Length; i++) {
                 int index = map->mapUnit16((i << 8) | i) >> 8;
                 mapped[i] = linear[index];
                 mapped[i + kCache32Count] = linear[index + kCache32Count];
@@ -673,6 +722,7 @@
             fCache32PixelRef = newPR;
             fCache32 = (SkPMColor*)newPR->getAddr();
         }
+        complete_32bit_cache(fCache32, kCache32Count);
     }
     return fCache32;
 }
@@ -694,7 +744,7 @@
     if (fMapper) {
         // force our cahce32pixelref to be built
         (void)this->getCache32();
-        bitmap->setConfig(SkBitmap::kARGB_8888_Config, kCache32Count, 1);
+        bitmap->setConfig(SkBitmap::kARGB_8888_Config, kGradient32Length, 1);
         bitmap->setPixelRef(fCache32PixelRef);
         return;
     }
@@ -720,7 +770,7 @@
 
     ///////////////////////////////////
 
-    static SkMutex gMutex;
+    SK_DECLARE_STATIC_MUTEX(gMutex);
     static SkBitmapCache* gCache;
     // each cache cost 1K of RAM, since each bitmap will be 1x256 at 32bpp
     static const int MAX_NUM_CACHED_GRADIENT_BITMAPS = 32;
@@ -734,7 +784,9 @@
     if (!gCache->find(storage.get(), size, bitmap)) {
         // force our cahce32pixelref to be built
         (void)this->getCache32();
-        bitmap->setConfig(SkBitmap::kARGB_8888_Config, kCache32Count, 1);
+        // Only expose the linear section of the cache; don't let the caller
+        // know about the padding at the end to make interpolation faster.
+        bitmap->setConfig(SkBitmap::kARGB_8888_Config, kGradient32Length, 1);
         bitmap->setPixelRef(fCache32PixelRef);
 
         gCache->add(storage.get(), size, *bitmap);
@@ -850,25 +902,54 @@
     SkASSERT(fi <= 0xFF);           \
     fx += dx;                       \
     *dstC++ = cache[toggle + fi];   \
-    toggle ^= Gradient_Shader::kToggleMask32; \
+    toggle ^= Gradient_Shader::kDitherStride32; \
     } while (0)
 
 namespace {
 
 typedef void (*LinearShadeProc)(TileProc proc, SkFixed dx, SkFixed fx,
-                                SkPMColor* SK_RESTRICT dstC,
-                                const SkPMColor* SK_RESTRICT cache,
+                                SkPMColor* dstC, const SkPMColor* cache,
                                 int toggle, int count);
 
+// This function is deprecated, and will be replaced by
+// shadeSpan_linear_vertical_lerp() once Chrome has been weaned off of it.
 void shadeSpan_linear_vertical(TileProc proc, SkFixed dx, SkFixed fx,
                                SkPMColor* SK_RESTRICT dstC,
                                const SkPMColor* SK_RESTRICT cache,
                                int toggle, int count) {
-    // we're a vertical gradient, so no change in a span
-    unsigned fi = proc(fx) >> Gradient_Shader::kCache32Shift;
-    sk_memset32_dither(dstC, cache[toggle + fi],
-        cache[(toggle ^ Gradient_Shader::kToggleMask32) + fi], count);
+    // We're a vertical gradient, so no change in a span.
+    // This variant only dithers between the two cache halves; it does not
+    // lerp between adjacent entries (see shadeSpan_linear_vertical_lerp()).
+    unsigned fullIndex = proc(fx);
+    unsigned fi = fullIndex >> (16 - Gradient_Shader::kCache32Bits);
+    sk_memset32_dither(dstC,
+            cache[toggle + fi],
+            cache[(toggle ^ Gradient_Shader::kDitherStride32) + fi],
+            count);
+}
 
+// Linear interpolation (lerp) is unnecessary if there are no sharp
+// discontinuities in the gradient - which must be true if there are
+// only 2 colors - but it's cheap.
+void shadeSpan_linear_vertical_lerp(TileProc proc, SkFixed dx, SkFixed fx,
+                                    SkPMColor* SK_RESTRICT dstC,
+                                    const SkPMColor* SK_RESTRICT cache,
+                                    int toggle, int count) {
+    // We're a vertical gradient, so no change in a span.
+    // If colors change sharply across the gradient, dithering is
+    // insufficient (it subsamples the color space) and we need to lerp.
+    unsigned fullIndex = proc(fx);
+    unsigned fi = fullIndex >> (16 - Gradient_Shader::kCache32Bits);
+    unsigned remainder = fullIndex & Gradient_Shader::kLerpRemainderMask32;
+    SkPMColor lerp =
+        SkFastFourByteInterp(
+            cache[toggle + fi + 1],
+            cache[toggle + fi], remainder);
+    SkPMColor dlerp =
+        SkFastFourByteInterp(
+            cache[(toggle ^ Gradient_Shader::kDitherStride32) + fi + 1],
+            cache[(toggle ^ Gradient_Shader::kDitherStride32) + fi], remainder);
+    sk_memset32_dither(dstC, lerp, dlerp, count);
 }
 
 void shadeSpan_linear_clamp(TileProc proc, SkFixed dx, SkFixed fx,
@@ -876,12 +957,12 @@
                             const SkPMColor* SK_RESTRICT cache,
                             int toggle, int count) {
     SkClampRange range;
-    range.init(fx, dx, count, 0, 0xFF);
+    range.init(fx, dx, count, 0, Gradient_Shader::kGradient32Length);
 
     if ((count = range.fCount0) > 0) {
         sk_memset32_dither(dstC,
             cache[toggle + range.fV0],
-            cache[(toggle ^ Gradient_Shader::kToggleMask32) + range.fV0],
+            cache[(toggle ^ Gradient_Shader::kDitherStride32) + range.fV0],
             count);
         dstC += count;
     }
@@ -903,13 +984,11 @@
     if ((count = range.fCount2) > 0) {
         sk_memset32_dither(dstC,
             cache[toggle + range.fV1],
-            cache[(toggle ^ Gradient_Shader::kToggleMask32) + range.fV1],
+            cache[(toggle ^ Gradient_Shader::kDitherStride32) + range.fV1],
             count);
     }
 }
 
-// TODO: we could merge mirror and repeat if we passed in a pointer to the
-// *_8bits proc, but that'd lose inlining, which might be significant here.
 void shadeSpan_linear_mirror(TileProc proc, SkFixed dx, SkFixed fx,
                              SkPMColor* SK_RESTRICT dstC,
                              const SkPMColor* SK_RESTRICT cache,
@@ -919,7 +998,7 @@
         SkASSERT(fi <= 0xFF);
         fx += dx;
         *dstC++ = cache[toggle + fi];
-        toggle ^= Gradient_Shader::kToggleMask32;
+        toggle ^= Gradient_Shader::kDitherStride32;
     } while (--count != 0);
 }
 
@@ -932,9 +1011,10 @@
         SkASSERT(fi <= 0xFF);
         fx += dx;
         *dstC++ = cache[toggle + fi];
-        toggle ^= Gradient_Shader::kToggleMask32;
+        toggle ^= Gradient_Shader::kDitherStride32;
     } while (--count != 0);
 }
+
 }
 
 void Linear_Gradient::shadeSpan(int x, int y, SkPMColor* SK_RESTRICT dstC,
@@ -946,7 +1026,7 @@
     TileProc            proc = fTileProc;
     const SkPMColor* SK_RESTRICT cache = this->getCache32();
 #ifdef USE_DITHER_32BIT_GRADIENT
-    int                 toggle = ((x ^ y) & 1) << kCache32Bits;
+    int                 toggle = ((x ^ y) & 1) * kDitherStride32;
 #else
     int toggle = 0;
 #endif
@@ -967,7 +1047,15 @@
 
         LinearShadeProc shadeProc = shadeSpan_linear_repeat;
         if (SkFixedNearlyZero(dx)) {
-            shadeProc = shadeSpan_linear_vertical;
+#ifdef SK_SIMPLE_TWOCOLOR_VERTICAL_GRADIENTS
+            if (fColorCount > 2) {
+                shadeProc = shadeSpan_linear_vertical_lerp;
+            } else {
+                shadeProc = shadeSpan_linear_vertical;
+            }
+#else
+            shadeProc = shadeSpan_linear_vertical_lerp;
+#endif
         } else if (proc == clamp_tileproc) {
             shadeProc = shadeSpan_linear_clamp;
         } else if (proc == mirror_tileproc) {
@@ -984,7 +1072,7 @@
             unsigned fi = proc(SkScalarToFixed(srcPt.fX));
             SkASSERT(fi <= 0xFFFF);
             *dstC++ = cache[toggle + (fi >> kCache32Shift)];
-            toggle ^= Gradient_Shader::kToggleMask32;
+            toggle ^= Gradient_Shader::kDitherStride32;
             dstX += SK_Scalar1;
         } while (--count != 0);
     }
@@ -998,7 +1086,7 @@
         this->commonAsABitmap(bitmap);
     }
     if (matrix) {
-        matrix->setScale(SkIntToScalar(kCache32Count), SK_Scalar1);
+        matrix->setScale(SkIntToScalar(kGradient32Length), SK_Scalar1);
         matrix->preConcat(fPtsToUnit);
     }
     if (xy) {
@@ -1035,17 +1123,16 @@
 #define NO_CHECK_ITER_16                \
     do {                                \
     unsigned fi = fx >> Gradient_Shader::kCache16Shift;  \
-    SkASSERT(fi <= Gradient_Shader::kCache16Mask);       \
+    SkASSERT(fi < Gradient_Shader::kCache16Count);       \
     fx += dx;                           \
     *dstC++ = cache[toggle + fi];       \
-    toggle ^= Gradient_Shader::kToggleMask16;            \
+    toggle ^= Gradient_Shader::kDitherStride16;            \
     } while (0)
 
 namespace {
 
 typedef void (*LinearShade16Proc)(TileProc proc, SkFixed dx, SkFixed fx,
-                                  uint16_t* SK_RESTRICT dstC,
-                                  const uint16_t* SK_RESTRICT cache,
+                                  uint16_t* dstC, const uint16_t* cache,
                                   int toggle, int count);
 
 void shadeSpan16_linear_vertical(TileProc proc, SkFixed dx, SkFixed fx,
@@ -1054,9 +1141,9 @@
                                  int toggle, int count) {
     // we're a vertical gradient, so no change in a span
     unsigned fi = proc(fx) >> Gradient_Shader::kCache16Shift;
-    SkASSERT(fi <= Gradient_Shader::kCache16Mask);
+    SkASSERT(fi < Gradient_Shader::kCache16Count);
     dither_memset16(dstC, cache[toggle + fi],
-        cache[(toggle ^ Gradient_Shader::kToggleMask16) + fi], count);
+        cache[(toggle ^ Gradient_Shader::kDitherStride16) + fi], count);
 
 }
 
@@ -1065,12 +1152,12 @@
                               const uint16_t* SK_RESTRICT cache,
                               int toggle, int count) {
     SkClampRange range;
-    range.init(fx, dx, count, 0, Gradient_Shader::kCache16Mask);
+    range.init(fx, dx, count, 0, Gradient_Shader::kGradient16Length);
 
     if ((count = range.fCount0) > 0) {
         dither_memset16(dstC,
             cache[toggle + range.fV0],
-            cache[(toggle ^ Gradient_Shader::kToggleMask16) + range.fV0],
+            cache[(toggle ^ Gradient_Shader::kDitherStride16) + range.fV0],
             count);
         dstC += count;
     }
@@ -1092,7 +1179,7 @@
     if ((count = range.fCount2) > 0) {
         dither_memset16(dstC,
             cache[toggle + range.fV1],
-            cache[(toggle ^ Gradient_Shader::kToggleMask16) + range.fV1],
+            cache[(toggle ^ Gradient_Shader::kDitherStride16) + range.fV1],
             count);
     }
 }
@@ -1104,10 +1191,10 @@
     do {
         unsigned fi = mirror_bits(fx >> Gradient_Shader::kCache16Shift,
                                         Gradient_Shader::kCache16Bits);
-        SkASSERT(fi <= Gradient_Shader::kCache16Mask);
+        SkASSERT(fi < Gradient_Shader::kCache16Count);
         fx += dx;
         *dstC++ = cache[toggle + fi];
-        toggle ^= Gradient_Shader::kToggleMask16;
+        toggle ^= Gradient_Shader::kDitherStride16;
     } while (--count != 0);
 }
 
@@ -1119,10 +1206,10 @@
     do {
         unsigned fi = repeat_bits(fx >> Gradient_Shader::kCache16Shift,
                                   Gradient_Shader::kCache16Bits);
-        SkASSERT(fi <= Gradient_Shader::kCache16Mask);
+        SkASSERT(fi < Gradient_Shader::kCache16Count);
         fx += dx;
         *dstC++ = cache[toggle + fi];
-        toggle ^= Gradient_Shader::kToggleMask16;
+        toggle ^= Gradient_Shader::kDitherStride16;
     } while (--count != 0);
 }
 }
@@ -1135,7 +1222,7 @@
     SkMatrix::MapXYProc dstProc = fDstToIndexProc;
     TileProc            proc = fTileProc;
     const uint16_t* SK_RESTRICT cache = this->getCache16();
-    int                 toggle = ((x ^ y) & 1) << kCache16Bits;
+    int                 toggle = ((x ^ y) & 1) * kDitherStride16;
 
     if (fDstToIndexClass != kPerspective_MatrixClass) {
         dstProc(fDstToIndex, SkIntToScalar(x) + SK_ScalarHalf,
@@ -1172,7 +1259,7 @@
 
             int index = fi >> kCache16Shift;
             *dstC++ = cache[toggle + index];
-            toggle ^= Gradient_Shader::kToggleMask16;
+            toggle ^= Gradient_Shader::kDitherStride16;
 
             dstX += SK_Scalar1;
         } while (--count != 0);
@@ -1230,14 +1317,14 @@
 
 namespace {
 
-typedef void (* RadialShade16Proc)(SkFixed fx, SkFixed dx,
-        SkFixed fy, SkFixed dy,
-        uint16_t* dstC, const uint16_t* SK_RESTRICT cache,
+typedef void (* RadialShade16Proc)(SkScalar sfx, SkScalar sdx,
+        SkScalar sfy, SkScalar sdy,
+        uint16_t* dstC, const uint16_t* cache,
         int toggle, int count);
 
-void shadeSpan16_radial_clamp(SkFixed fx, SkFixed dx,
-        SkFixed fy, SkFixed dy,
-        uint16_t* dstC, const uint16_t* SK_RESTRICT cache,
+void shadeSpan16_radial_clamp(SkScalar sfx, SkScalar sdx,
+        SkScalar sfy, SkScalar sdy,
+        uint16_t* SK_RESTRICT dstC, const uint16_t* SK_RESTRICT cache,
         int toggle, int count) {
     const uint8_t* SK_RESTRICT sqrt_table = gSqrt8Table;
 
@@ -1248,10 +1335,10 @@
        we could (it seems) put this scale-down into fDstToIndex,
        to avoid having to do these extra shifts each time.
     */
-    fx >>= 1;
-    dx >>= 1;
-    fy >>= 1;
-    dy >>= 1;
+    SkFixed fx = SkScalarToFixed(sfx) >> 1;
+    SkFixed dx = SkScalarToFixed(sdx) >> 1;
+    SkFixed fy = SkScalarToFixed(sfy) >> 1;
+    SkFixed dy = SkScalarToFixed(sdy) >> 1;
     // might perform this check for the other modes,
     // but the win will be a smaller % of the total
     if (dy == 0) {
@@ -1264,7 +1351,7 @@
             fx += dx;
             *dstC++ = cache[toggle +
                             (sqrt_table[fi] >> Gradient_Shader::kSqrt16Shift)];
-            toggle ^= Gradient_Shader::kToggleMask16;
+            toggle ^= Gradient_Shader::kDitherStride16;
         } while (--count != 0);
     } else {
         do {
@@ -1276,28 +1363,43 @@
             fy += dy;
             *dstC++ = cache[toggle +
                             (sqrt_table[fi] >> Gradient_Shader::kSqrt16Shift)];
-            toggle ^= Gradient_Shader::kToggleMask16;
+            toggle ^= Gradient_Shader::kDitherStride16;
         } while (--count != 0);
     }
 }
 
-void shadeSpan16_radial_mirror(SkFixed fx, SkFixed dx, SkFixed fy, SkFixed dy,
-        uint16_t* dstC, const uint16_t* SK_RESTRICT cache,
+void shadeSpan16_radial_mirror(SkScalar sfx, SkScalar sdx,
+        SkScalar sfy, SkScalar sdy,
+        uint16_t* SK_RESTRICT dstC, const uint16_t* SK_RESTRICT cache,
         int toggle, int count) {
     do {
-        SkFixed dist = SkFixedSqrt(SkFixedSquare(fx) + SkFixedSquare(fy));
+#ifdef SK_SCALAR_IS_FLOAT
+        float fdist = sk_float_sqrt(sfx*sfx + sfy*sfy);
+        SkFixed dist = SkFloatToFixed(fdist);
+#else
+        SkFixed magnitudeSquared = SkFixedSquare(sfx) +
+            SkFixedSquare(sfy);
+        if (magnitudeSquared < 0) // Overflow.
+            magnitudeSquared = SK_FixedMax;
+        SkFixed dist = SkFixedSqrt(magnitudeSquared);
+#endif
         unsigned fi = mirror_tileproc(dist);
         SkASSERT(fi <= 0xFFFF);
-        fx += dx;
-        fy += dy;
         *dstC++ = cache[toggle + (fi >> Gradient_Shader::kCache16Shift)];
-        toggle ^= Gradient_Shader::kToggleMask16;
+        toggle ^= Gradient_Shader::kDitherStride16;
+        sfx += sdx;
+        sfy += sdy;
     } while (--count != 0);
 }
 
-void shadeSpan16_radial_repeat(SkFixed fx, SkFixed dx, SkFixed fy, SkFixed dy,
-        uint16_t* dstC, const uint16_t* SK_RESTRICT cache,
+void shadeSpan16_radial_repeat(SkScalar sfx, SkScalar sdx,
+        SkScalar sfy, SkScalar sdy,
+        uint16_t* SK_RESTRICT dstC, const uint16_t* SK_RESTRICT cache,
         int toggle, int count) {
+    SkFixed fx = SkScalarToFixed(sfx);
+    SkFixed dx = SkScalarToFixed(sdx);
+    SkFixed fy = SkScalarToFixed(sfy);
+    SkFixed dy = SkScalarToFixed(sdy);
     do {
         SkFixed dist = SkFixedSqrt(SkFixedSquare(fx) + SkFixedSquare(fy));
         unsigned fi = repeat_tileproc(dist);
@@ -1305,7 +1407,7 @@
         fx += dx;
         fy += dy;
         *dstC++ = cache[toggle + (fi >> Gradient_Shader::kCache16Shift)];
-        toggle ^= Gradient_Shader::kToggleMask16;
+        toggle ^= Gradient_Shader::kDitherStride16;
     } while (--count != 0);
 }
 
@@ -1326,31 +1428,33 @@
         rad_to_unit_matrix(center, radius, &fPtsToUnit);
     }
 
-    virtual void shadeSpan(int x, int y, SkPMColor* dstC, int count) SK_OVERRIDE;
-    virtual void shadeSpan16(int x, int y, uint16_t* SK_RESTRICT dstC, int count) SK_OVERRIDE {
+    virtual void shadeSpan(int x, int y, SkPMColor* dstC, int count)
+        SK_OVERRIDE;
+    virtual void shadeSpan16(int x, int y, uint16_t* SK_RESTRICT dstC,
+                             int count) SK_OVERRIDE {
         SkASSERT(count > 0);
 
         SkPoint             srcPt;
         SkMatrix::MapXYProc dstProc = fDstToIndexProc;
         TileProc            proc = fTileProc;
         const uint16_t* SK_RESTRICT cache = this->getCache16();
-        int                 toggle = ((x ^ y) & 1) << kCache16Bits;
+        int                 toggle = ((x ^ y) & 1) * kDitherStride16;
 
         if (fDstToIndexClass != kPerspective_MatrixClass) {
             dstProc(fDstToIndex, SkIntToScalar(x) + SK_ScalarHalf,
                                  SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
-            SkFixed dx, fx = SkScalarToFixed(srcPt.fX);
-            SkFixed dy, fy = SkScalarToFixed(srcPt.fY);
+
+            SkScalar sdx = fDstToIndex.getScaleX();
+            SkScalar sdy = fDstToIndex.getSkewY();
 
             if (fDstToIndexClass == kFixedStepInX_MatrixClass) {
                 SkFixed storage[2];
-                (void)fDstToIndex.fixedStepInX(SkIntToScalar(y), &storage[0], &storage[1]);
-                dx = storage[0];
-                dy = storage[1];
+                (void)fDstToIndex.fixedStepInX(SkIntToScalar(y),
+                                               &storage[0], &storage[1]);
+                sdx = SkFixedToScalar(storage[0]);
+                sdy = SkFixedToScalar(storage[1]);
             } else {
                 SkASSERT(fDstToIndexClass == kLinear_MatrixClass);
-                dx = SkScalarToFixed(fDstToIndex.getScaleX());
-                dy = SkScalarToFixed(fDstToIndex.getSkewY());
             }
 
             RadialShade16Proc shadeProc = shadeSpan16_radial_repeat;
@@ -1361,7 +1465,8 @@
             } else {
                 SkASSERT(proc == repeat_tileproc);
             }
-            (*shadeProc)(fx, dx, fy, dy, dstC, cache, toggle, count);
+            (*shadeProc)(srcPt.fX, sdx, srcPt.fY, sdy, dstC,
+                         cache, toggle, count);
         } else {    // perspective case
             SkScalar dstX = SkIntToScalar(x);
             SkScalar dstY = SkIntToScalar(y);
@@ -1372,7 +1477,7 @@
 
                 int index = fi >> (16 - kCache16Bits);
                 *dstC++ = cache[toggle + index];
-                toggle ^= (1 << kCache16Bits);
+                toggle ^= kDitherStride16;
 
                 dstX += SK_Scalar1;
             } while (--count != 0);
@@ -1388,8 +1493,8 @@
             this->commonAsABitmap(bitmap);
         }
         if (matrix) {
-            matrix->setScale(SkIntToScalar(kCache32Count),
-                             SkIntToScalar(kCache32Count));
+            matrix->setScale(SkIntToScalar(kGradient32Length),
+                             SkIntToScalar(kGradient32Length));
             matrix->preConcat(fPtsToUnit);
         }
         if (xy) {
@@ -1467,27 +1572,35 @@
 
 #define UNPINNED_RADIAL_STEP \
     fi = (fx * fx + fy * fy) >> (14 + 16 - kSQRT_TABLE_BITS); \
-    *dstC++ = cache[sqrt_table[fi] >> Gradient_Shader::kSqrt32Shift]; \
+    *dstC++ = cache[toggle + \
+                    (sqrt_table[fi] >> Gradient_Shader::kSqrt32Shift)]; \
+    toggle ^= Gradient_Shader::kDitherStride32; \
     fx += dx; \
     fy += dy;
 
-typedef void (* RadialShadeProc)(SkFixed fx, SkFixed dx, SkFixed fy, SkFixed dy,
-        SkPMColor* dstC, const SkPMColor* SK_RESTRICT cache,
-        int count, SkPoint& srcPt, float fdx, float fdy);
+typedef void (* RadialShadeProc)(SkScalar sfx, SkScalar sdx,
+        SkScalar sfy, SkScalar sdy,
+        SkPMColor* dstC, const SkPMColor* cache,
+        int count, int toggle);
 
 // On Linux, this is faster with SkPMColor[] params than SkPMColor* SK_RESTRICT
-void shadeSpan_radial_clamp(SkFixed fx, SkFixed dx, SkFixed fy, SkFixed dy,
+void shadeSpan_radial_clamp(SkScalar sfx, SkScalar sdx,
+        SkScalar sfy, SkScalar sdy,
         SkPMColor* SK_RESTRICT dstC, const SkPMColor* SK_RESTRICT cache,
-        int count, SkPoint& srcPt, float fdx, float fdy) {
+        int count, int toggle) {
     // Floating point seems to be slower than fixed point,
     // even when we have float hardware.
     const uint8_t* SK_RESTRICT sqrt_table = gSqrt8Table;
-    fx >>= 1;
-    dx >>= 1;
-    fy >>= 1;
-    dy >>= 1;
+    SkFixed fx = SkScalarToFixed(sfx) >> 1;
+    SkFixed dx = SkScalarToFixed(sdx) >> 1;
+    SkFixed fy = SkScalarToFixed(sfy) >> 1;
+    SkFixed dy = SkScalarToFixed(sdy) >> 1;
     if ((count > 4) && radial_completely_pinned(fx, dx, fy, dy)) {
-        sk_memset32(dstC, cache[Gradient_Shader::kCache32Count - 1], count);
+        unsigned fi = Gradient_Shader::kGradient32Length;
+        sk_memset32_dither(dstC,
+            cache[toggle + fi],
+            cache[(toggle ^ Gradient_Shader::kDitherStride32) + fi],
+            count);
     } else if ((count > 4) &&
                no_need_for_radial_pin(fx, dx, fy, dy, count)) {
         unsigned fi;
@@ -1502,51 +1615,70 @@
         }
     }
     else  {
-        do {
-            unsigned xx = SkPin32(fx, -0xFFFF >> 1, 0xFFFF >> 1);
-            unsigned fi = SkPin32(fy, -0xFFFF >> 1, 0xFFFF >> 1);
-            fi = (xx * xx + fi * fi) >> (14 + 16 - kSQRT_TABLE_BITS);
-            fi = SkFastMin32(fi, 0xFFFF >> (16 - kSQRT_TABLE_BITS));
-            *dstC++ = cache[sqrt_table[fi] >> Gradient_Shader::kSqrt32Shift];
-            fx += dx;
-            fy += dy;
-        } while (--count != 0);
+        // Specializing for dy == 0 gains us 25% on Skia benchmarks
+        if (dy == 0) {
+            unsigned yy = SkPin32(fy, -0xFFFF >> 1, 0xFFFF >> 1);
+            yy *= yy;
+            do {
+                unsigned xx = SkPin32(fx, -0xFFFF >> 1, 0xFFFF >> 1);
+                unsigned fi = (xx * xx + yy) >> (14 + 16 - kSQRT_TABLE_BITS);
+                fi = SkFastMin32(fi, 0xFFFF >> (16 - kSQRT_TABLE_BITS));
+                *dstC++ = cache[toggle + (sqrt_table[fi] >>
+                    Gradient_Shader::kSqrt32Shift)];
+                toggle ^= Gradient_Shader::kDitherStride32;
+                fx += dx;
+            } while (--count != 0);
+        } else {
+            do {
+                unsigned xx = SkPin32(fx, -0xFFFF >> 1, 0xFFFF >> 1);
+                unsigned fi = SkPin32(fy, -0xFFFF >> 1, 0xFFFF >> 1);
+                fi = (xx * xx + fi * fi) >> (14 + 16 - kSQRT_TABLE_BITS);
+                fi = SkFastMin32(fi, 0xFFFF >> (16 - kSQRT_TABLE_BITS));
+                *dstC++ = cache[toggle + (sqrt_table[fi] >>
+                    Gradient_Shader::kSqrt32Shift)];
+                toggle ^= Gradient_Shader::kDitherStride32;
+                fx += dx;
+                fy += dy;
+            } while (--count != 0);
+        }
     }
 }
 
-void shadeSpan_radial_mirror(SkFixed fx, SkFixed dx, SkFixed fy, SkFixed dy,
+// Unrolling this loop doesn't seem to help (when float); we're stalling to
+// get the results of the sqrt (?), and don't have enough extra registers to
+// have many in flight.
+void shadeSpan_radial_mirror(SkScalar sfx, SkScalar sdx,
+        SkScalar sfy, SkScalar sdy,
         SkPMColor* SK_RESTRICT dstC, const SkPMColor* SK_RESTRICT cache,
-        int count, SkPoint& srcPt, float fdx, float fdy) {
-#ifdef SK_USE_FLOAT_SQRT
-    float ffx = srcPt.fX;
-    float ffy = srcPt.fY;
+        int count, int toggle) {
     do {
-        float fdist = sk_float_sqrt(ffx*ffx + ffy*ffy);
-        unsigned fi = mirror_tileproc(SkFloatToFixed(fdist));
-        SkASSERT(fi <= 0xFFFF);
-        *dstC++ = cache[fi >> Gradient_Shader::kCache32Shift];
-        ffx += fdx;
-        ffy += fdy;
-    } while (--count != 0);
+#ifdef SK_SCALAR_IS_FLOAT
+        float fdist = sk_float_sqrt(sfx*sfx + sfy*sfy);
+        SkFixed dist = SkFloatToFixed(fdist);
 #else
-    do {
-        SkFixed magnitudeSquared = SkFixedSquare(fx) +
-            SkFixedSquare(fy);
+        SkFixed magnitudeSquared = SkFixedSquare(sfx) +
+            SkFixedSquare(sfy);
         if (magnitudeSquared < 0) // Overflow.
             magnitudeSquared = SK_FixedMax;
         SkFixed dist = SkFixedSqrt(magnitudeSquared);
+#endif
         unsigned fi = mirror_tileproc(dist);
         SkASSERT(fi <= 0xFFFF);
-        *dstC++ = cache[fi >> Gradient_Shader::kCache32Shift];
-        fx += dx;
-        fy += dy;
+        *dstC++ = cache[toggle + (fi >> Gradient_Shader::kCache32Shift)];
+        toggle ^= Gradient_Shader::kDitherStride32;
+        sfx += sdx;
+        sfy += sdy;
     } while (--count != 0);
-#endif
 }
 
-void shadeSpan_radial_repeat(SkFixed fx, SkFixed dx, SkFixed fy, SkFixed dy,
+void shadeSpan_radial_repeat(SkScalar sfx, SkScalar sdx,
+        SkScalar sfy, SkScalar sdy,
         SkPMColor* SK_RESTRICT dstC, const SkPMColor* SK_RESTRICT cache,
-        int count, SkPoint& srcPt, float fdx, float fdy) {
+        int count, int toggle) {
+    SkFixed fx = SkScalarToFixed(sfx);
+    SkFixed dx = SkScalarToFixed(sdx);
+    SkFixed fy = SkScalarToFixed(sfy);
+    SkFixed dy = SkScalarToFixed(sdy);
     do {
         SkFixed magnitudeSquared = SkFixedSquare(fx) +
             SkFixedSquare(fy);
@@ -1555,7 +1687,8 @@
         SkFixed dist = SkFixedSqrt(magnitudeSquared);
         unsigned fi = repeat_tileproc(dist);
         SkASSERT(fi <= 0xFFFF);
-        *dstC++ = cache[fi >> Gradient_Shader::kCache32Shift];
+        *dstC++ = cache[toggle + (fi >> Gradient_Shader::kCache32Shift)];
+        toggle ^= Gradient_Shader::kDitherStride32;
         fx += dx;
         fy += dy;
     } while (--count != 0);
@@ -1570,32 +1703,26 @@
     SkMatrix::MapXYProc dstProc = fDstToIndexProc;
     TileProc            proc = fTileProc;
     const SkPMColor* SK_RESTRICT cache = this->getCache32();
+#ifdef USE_DITHER_32BIT_GRADIENT
+    int toggle = ((x ^ y) & 1) * Gradient_Shader::kDitherStride32;
+#else
+    int toggle = 0;
+#endif
 
     if (fDstToIndexClass != kPerspective_MatrixClass) {
         dstProc(fDstToIndex, SkIntToScalar(x) + SK_ScalarHalf,
                              SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
-        SkFixed dx, fx = SkScalarToFixed(srcPt.fX);
-        SkFixed dy, fy = SkScalarToFixed(srcPt.fY);
-        float fdx = 0;
-        float fdy = 0;
+        SkScalar sdx = fDstToIndex.getScaleX();
+        SkScalar sdy = fDstToIndex.getSkewY();
 
         if (fDstToIndexClass == kFixedStepInX_MatrixClass) {
             SkFixed storage[2];
-            (void)fDstToIndex.fixedStepInX(SkIntToScalar(y), &storage[0], &storage[1]);
-            dx = storage[0];
-            dy = storage[1];
-#ifdef SK_USE_FLOAT_SQRT
-            fdx = SkFixedToFloat(storage[0]);
-            fdy = SkFixedToFloat(storage[1]);
-#endif
+            (void)fDstToIndex.fixedStepInX(SkIntToScalar(y),
+                                           &storage[0], &storage[1]);
+            sdx = SkFixedToScalar(storage[0]);
+            sdy = SkFixedToScalar(storage[1]);
         } else {
             SkASSERT(fDstToIndexClass == kLinear_MatrixClass);
-            dx = SkScalarToFixed(fDstToIndex.getScaleX());
-            dy = SkScalarToFixed(fDstToIndex.getSkewY());
-#ifdef SK_USE_FLOAT_SQRT
-            fdx = fDstToIndex.getScaleX();
-            fdy = fDstToIndex.getSkewY();
-#endif
         }
 
         RadialShadeProc shadeProc = shadeSpan_radial_repeat;
@@ -1606,7 +1733,7 @@
         } else {
             SkASSERT(proc == repeat_tileproc);
         }
-        (*shadeProc)(fx, dx, fy, dy, dstC, cache, count, srcPt, fdx, fdy);
+        (*shadeProc)(srcPt.fX, sdx, srcPt.fY, sdy, dstC, cache, count, toggle);
     } else {    // perspective case
         SkScalar dstX = SkIntToScalar(x);
         SkScalar dstY = SkIntToScalar(y);
@@ -1720,14 +1847,14 @@
         SkScalar fy, SkScalar dy,
         SkScalar b, SkScalar db,
         SkScalar fSr2D2, SkScalar foura, SkScalar fOneOverTwoA, bool posRoot,
-        SkPMColor* dstC, const SkPMColor* SK_RESTRICT cache,
+        SkPMColor* SK_RESTRICT dstC, const SkPMColor* SK_RESTRICT cache,
         int count);
 
 void shadeSpan_twopoint_clamp(SkScalar fx, SkScalar dx,
         SkScalar fy, SkScalar dy,
         SkScalar b, SkScalar db,
         SkScalar fSr2D2, SkScalar foura, SkScalar fOneOverTwoA, bool posRoot,
-        SkPMColor* dstC, const SkPMColor* SK_RESTRICT cache,
+        SkPMColor* SK_RESTRICT dstC, const SkPMColor* SK_RESTRICT cache,
         int count) {
     for (; count > 0; --count) {
         SkFixed t = two_point_radial(b, fx, fy, fSr2D2, foura,
@@ -1744,7 +1871,7 @@
         SkScalar fy, SkScalar dy,
         SkScalar b, SkScalar db,
         SkScalar fSr2D2, SkScalar foura, SkScalar fOneOverTwoA, bool posRoot,
-        SkPMColor* dstC, const SkPMColor* SK_RESTRICT cache,
+        SkPMColor* SK_RESTRICT dstC, const SkPMColor* SK_RESTRICT cache,
         int count) {
     for (; count > 0; --count) {
         SkFixed t = two_point_radial(b, fx, fy, fSr2D2, foura,
@@ -1762,7 +1889,7 @@
         SkScalar fy, SkScalar dy,
         SkScalar b, SkScalar db,
         SkScalar fSr2D2, SkScalar foura, SkScalar fOneOverTwoA, bool posRoot,
-        SkPMColor* dstC, const SkPMColor* SK_RESTRICT cache,
+        SkPMColor* SK_RESTRICT dstC, const SkPMColor* SK_RESTRICT cache,
         int count) {
     for (; count > 0; --count) {
         SkFixed t = two_point_radial(b, fx, fy, fSr2D2, foura,
@@ -2270,7 +2397,8 @@
 }
 #endif
 
-void Sweep_Gradient::shadeSpan(int x, int y, SkPMColor* SK_RESTRICT dstC, int count) {
+void Sweep_Gradient::shadeSpan(int x, int y, SkPMColor* SK_RESTRICT dstC,
+                               int count) {
     SkMatrix::MapXYProc proc = fDstToIndexProc;
     const SkMatrix&     matrix = fDstToIndex;
     const SkPMColor* SK_RESTRICT cache = this->getCache32();
@@ -2308,11 +2436,12 @@
     }
 }
 
-void Sweep_Gradient::shadeSpan16(int x, int y, uint16_t* SK_RESTRICT dstC, int count) {
+void Sweep_Gradient::shadeSpan16(int x, int y, uint16_t* SK_RESTRICT dstC,
+                                 int count) {
     SkMatrix::MapXYProc proc = fDstToIndexProc;
     const SkMatrix&     matrix = fDstToIndex;
     const uint16_t* SK_RESTRICT cache = this->getCache16();
-    int                 toggle = ((x ^ y) & 1) << kCache16Bits;
+    int                 toggle = ((x ^ y) & 1) * kDitherStride16;
     SkPoint             srcPt;
 
     if (fDstToIndexClass != kPerspective_MatrixClass) {
@@ -2336,7 +2465,7 @@
         for (; count > 0; --count) {
             int index = SkATan2_255(fy, fx) >> (8 - kCache16Bits);
             *dstC++ = cache[toggle + index];
-            toggle ^= (1 << kCache16Bits);
+            toggle ^= kDitherStride16;
             fx += dx;
             fy += dy;
         }
@@ -2348,7 +2477,7 @@
             int index = SkATan2_255(srcPt.fY, srcPt.fX);
             index >>= (8 - kCache16Bits);
             *dstC++ = cache[toggle + index];
-            toggle ^= (1 << kCache16Bits);
+            toggle ^= kDitherStride16;
         }
     }
 }
diff --git a/src/effects/SkLayerDrawLooper.cpp b/src/effects/SkLayerDrawLooper.cpp
index acb3e88..16636dc 100644
--- a/src/effects/SkLayerDrawLooper.cpp
+++ b/src/effects/SkLayerDrawLooper.cpp
@@ -218,6 +218,8 @@
 
     for (int i = 0; i < count; i++) {
         LayerInfo info;
+        if (buffer.getPictureVersion() == PICTURE_VERSION_ICS)
+            info.fFlagsMask = buffer.readInt();
         info.fPaintBits = buffer.readInt();
         info.fColorMode = (SkXfermode::Mode)buffer.readInt();
         info.fOffset.fX = buffer.readScalar();
diff --git a/src/effects/SkMorphologyImageFilter.cpp b/src/effects/SkMorphologyImageFilter.cpp
new file mode 100644
index 0000000..78fabc5
--- /dev/null
+++ b/src/effects/SkMorphologyImageFilter.cpp
@@ -0,0 +1,221 @@
+/*
+ * Copyright 2012 The Android Open Source Project
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkMorphologyImageFilter.h"
+#include "SkColorPriv.h"
+
+SkMorphologyImageFilter::SkMorphologyImageFilter(SkFlattenableReadBuffer& buffer)
+  : INHERITED(buffer) {
+    fRadius.fWidth = buffer.readScalar();
+    fRadius.fHeight = buffer.readScalar();
+}
+
+SkMorphologyImageFilter::SkMorphologyImageFilter(int radiusX, int radiusY)
+    : fRadius(SkISize::Make(radiusX, radiusY)) {
+}
+
+
+void SkMorphologyImageFilter::flatten(SkFlattenableWriteBuffer& buffer) {
+    this->INHERITED::flatten(buffer);
+    buffer.writeScalar(fRadius.fWidth);
+    buffer.writeScalar(fRadius.fHeight);
+}
+
+static void erode(const SkPMColor* src, SkPMColor* dst,
+                  int radius, int width, int height,
+                  int srcStrideX, int srcStrideY,
+                  int dstStrideX, int dstStrideY)
+{
+    const SkPMColor* upperSrc = src + SkMin32(radius, width - 1) * srcStrideX;
+    for (int x = 0; x < width; ++x) {
+        const SkPMColor* lp = src;
+        const SkPMColor* up = upperSrc;
+        SkPMColor* dptr = dst;
+        for (int y = 0; y < height; ++y) {
+            int minB = 255, minG = 255, minR = 255, minA = 255;
+            for (const SkPMColor* p = lp; p <= up; p += srcStrideX) {
+                int b = SkGetPackedB32(*p);
+                int g = SkGetPackedG32(*p);
+                int r = SkGetPackedR32(*p);
+                int a = SkGetPackedA32(*p);
+                if (b < minB) minB = b;
+                if (g < minG) minG = g;
+                if (r < minR) minR = r;
+                if (a < minA) minA = a;
+            }
+            *dptr = SkPackARGB32(minA, minR, minG, minB);
+            dptr += dstStrideY;
+            lp += srcStrideY;
+            up += srcStrideY;
+        }
+        if (x >= radius) src += srcStrideX;
+        if (x + radius < width - 1) upperSrc += srcStrideX;
+        dst += dstStrideX;
+    }
+}
+
+static void erodeX(const SkBitmap& src, SkBitmap* dst, int radiusX)
+{
+    erode(src.getAddr32(0, 0), dst->getAddr32(0, 0),
+          radiusX, src.width(), src.height(),
+          1, src.rowBytesAsPixels(), 1, dst->rowBytesAsPixels());
+}
+
+static void erodeY(const SkBitmap& src, SkBitmap* dst, int radiusY)
+{
+    erode(src.getAddr32(0, 0), dst->getAddr32(0, 0),
+          radiusY, src.height(), src.width(),
+          src.rowBytesAsPixels(), 1, dst->rowBytesAsPixels(), 1);
+}
+
+static void dilate(const SkPMColor* src, SkPMColor* dst,
+                   int radius, int width, int height,
+                   int srcStrideX, int srcStrideY,
+                   int dstStrideX, int dstStrideY)
+{
+    const SkPMColor* upperSrc = src + SkMin32(radius, width - 1) * srcStrideX;
+    for (int x = 0; x < width; ++x) {
+        const SkPMColor* lp = src;
+        const SkPMColor* up = upperSrc;
+        SkPMColor* dptr = dst;
+        for (int y = 0; y < height; ++y) {
+            int maxB = 0, maxG = 0, maxR = 0, maxA = 0;
+            for (const SkPMColor* p = lp; p <= up; p += srcStrideX) {
+                int b = SkGetPackedB32(*p);
+                int g = SkGetPackedG32(*p);
+                int r = SkGetPackedR32(*p);
+                int a = SkGetPackedA32(*p);
+                if (b > maxB) maxB = b;
+                if (g > maxG) maxG = g;
+                if (r > maxR) maxR = r;
+                if (a > maxA) maxA = a;
+            }
+            *dptr = SkPackARGB32(maxA, maxR, maxG, maxB);
+            dptr += dstStrideY;
+            lp += srcStrideY;
+            up += srcStrideY;
+        }
+        if (x >= radius) src += srcStrideX;
+        if (x + radius < width - 1) upperSrc += srcStrideX;
+        dst += dstStrideX;
+    }
+}
+
+static void dilateX(const SkBitmap& src, SkBitmap* dst, int radiusX)
+{
+    dilate(src.getAddr32(0, 0), dst->getAddr32(0, 0),
+           radiusX, src.width(), src.height(),
+           1, src.rowBytesAsPixels(), 1, dst->rowBytesAsPixels());
+}
+
+static void dilateY(const SkBitmap& src, SkBitmap* dst, int radiusY)
+{
+    dilate(src.getAddr32(0, 0), dst->getAddr32(0, 0),
+           radiusY, src.height(), src.width(),
+           src.rowBytesAsPixels(), 1, dst->rowBytesAsPixels(), 1);
+}
+
+bool SkErodeImageFilter::onFilterImage(Proxy*,
+                                       const SkBitmap& src, const SkMatrix&,
+                                       SkBitmap* dst, SkIPoint*) {
+    // Per-channel min filter of src into *dst over radius(); false on failure.
+    if (src.config() != SkBitmap::kARGB_8888_Config) {
+        return false;
+    }
+
+    SkAutoLockPixels alp(src);
+    if (!src.getPixels()) {
+        return false;
+    }
+
+    dst->setConfig(src.config(), src.width(), src.height());
+    if (!dst->allocPixels()) {
+        return false;  // the passes below write through dst's pixel address
+    }
+
+    int width = radius().width();
+    int height = radius().height();
+    if (width < 0 || height < 0) {
+        return false;
+    }
+    if (width == 0 && height == 0) {
+        return src.copyTo(dst, dst->config());  // zero radius: plain copy
+    }
+
+    if (width > 0 && height > 0) {
+        // Only the two-pass case needs an intermediate bitmap.
+        SkBitmap temp;
+        temp.setConfig(dst->config(), dst->width(), dst->height());
+        if (!temp.allocPixels()) {
+            return false;
+        }
+        erodeX(src, &temp, width);
+        erodeY(temp, dst, height);
+    } else if (width > 0) {
+        erodeX(src, dst, width);
+    } else if (height > 0) {
+        erodeY(src, dst, height);
+    }
+    return true;
+}
+
+bool SkDilateImageFilter::onFilterImage(Proxy*,
+                                        const SkBitmap& src, const SkMatrix&,
+                                        SkBitmap* dst, SkIPoint*) {
+    // Per-channel max filter of src into *dst over radius(); false on failure.
+    if (src.config() != SkBitmap::kARGB_8888_Config) {
+        return false;
+    }
+
+    SkAutoLockPixels alp(src);
+    if (!src.getPixels()) {
+        return false;
+    }
+
+    dst->setConfig(src.config(), src.width(), src.height());
+    if (!dst->allocPixels()) {
+        return false;  // the passes below write through dst's pixel address
+    }
+
+    int width = radius().width();
+    int height = radius().height();
+    if (width < 0 || height < 0) {
+        return false;
+    }
+    if (width == 0 && height == 0) {
+        return src.copyTo(dst, dst->config());  // zero radius: plain copy
+    }
+
+    if (width > 0 && height > 0) {
+        // Only the two-pass case needs an intermediate bitmap.
+        SkBitmap temp;
+        temp.setConfig(dst->config(), dst->width(), dst->height());
+        if (!temp.allocPixels()) {
+            return false;
+        }
+        dilateX(src, &temp, width);
+        dilateY(temp, dst, height);
+    } else if (width > 0) {
+        dilateX(src, dst, width);
+    } else if (height > 0) {
+        dilateY(src, dst, height);
+    }
+    return true;
+}
+
+bool SkDilateImageFilter::asADilate(SkISize* radius) const {
+    *radius = this->radius();
+    return true;
+}
+
+bool SkErodeImageFilter::asAnErode(SkISize* radius) const {
+    *radius = this->radius();
+    return true;
+}
+
+SK_DEFINE_FLATTENABLE_REGISTRAR(SkDilateImageFilter)
+SK_DEFINE_FLATTENABLE_REGISTRAR(SkErodeImageFilter)
diff --git a/src/effects/SkTableMaskFilter.cpp b/src/effects/SkTableMaskFilter.cpp
index 5842e4b..4024372 100644
--- a/src/effects/SkTableMaskFilter.cpp
+++ b/src/effects/SkTableMaskFilter.cpp
@@ -92,10 +92,13 @@
 ///////////////////////////////////////////////////////////////////////////////
 
 void SkTableMaskFilter::MakeGammaTable(uint8_t table[256], SkScalar gamma) {
-    float x = 0;
     const float dx = 1 / 255.0f;
+    const float g = SkScalarToFloat(gamma);
+    // table[i] = round(255 * x^g), with x stepping from 0 in 1/255 increments.
+    float x = 0;
     for (int i = 0; i < 256; i++) {
-        table[i] = SkPin32(SkScalarRound(powf(x, gamma) * 255), 0, 255);
+        float ee = powf(x, g) * 255;  // evaluate powf once per entry
+        table[i] = SkPin32(sk_float_round2int(ee), 0, 255);
         x += dx;
     }
 }
diff --git a/src/gpu/GrAAConvexPathRenderer.cpp b/src/gpu/GrAAConvexPathRenderer.cpp
new file mode 100644
index 0000000..60749d8
--- /dev/null
+++ b/src/gpu/GrAAConvexPathRenderer.cpp
@@ -0,0 +1,489 @@
+
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "GrAAConvexPathRenderer.h"
+
+#include "GrContext.h"
+#include "GrDrawState.h"
+#include "GrPathUtils.h"
+#include "SkString.h"
+#include "SkTrace.h"
+
+
+GrAAConvexPathRenderer::GrAAConvexPathRenderer() {
+}
+
+namespace {
+
+struct Segment {
+    enum {
+        kLine,
+        kQuad
+    } fType;
+    // line uses one pt, quad uses 2 pts
+    GrPoint fPts[2];
+    // normal to edge ending at each pt
+    GrVec fNorms[2];
+    // is the corner where the previous segment meets this segment
+    // sharp. If so, fMid is a normalized bisector facing outward.
+    GrVec fMid;
+
+    int countPoints() {
+        return (kLine == fType) ? 1 : 2;
+    }
+    const SkPoint& endPt() const {
+        return (kLine == fType) ? fPts[0] : fPts[1];
+    };
+    const SkPoint& endNorm() const {
+        return (kLine == fType) ? fNorms[0] : fNorms[1];
+    };
+};
+
+typedef SkTArray<Segment, true> SegmentArray;
+
+void center_of_mass(const SegmentArray& segments, SkPoint* c) {
+    // Writes the centroid of the polygon formed by the segment end points to *c.
+    GrScalar area = 0;
+    SkPoint center;
+    center.set(0, 0);
+    int count = segments.count();
+    SkPoint p0;
+    if (count > 2) {
+        // We translate the polygon so that the first point is at the origin.
+        // This avoids some precision issues with small area polygons far away
+        // from the origin.
+        p0 = segments[0].endPt();
+        SkPoint pi;
+        SkPoint pj;
+        // the first and last iteration of the below loop would compute
+        // zeros since the starting / ending point is (0,0). So instead we start
+        // at i=1 and make the last iteration i=count-2.
+        pj = segments[1].endPt() - p0;
+        for (int i = 1; i < count - 1; ++i) {
+            pi = pj;
+            // Update the outer pj so the next iteration picks it up as pi.
+            pj = segments[i + 1].endPt() - p0;
+            GrScalar t = GrMul(pi.fX, pj.fY) - GrMul(pj.fX, pi.fY);
+            area += t;
+            center.fX += (pi.fX + pj.fX) * t;
+            center.fY += (pi.fY + pj.fY) * t;
+        }
+    }
+    // If the poly has no area then we instead return the average of
+    // its points.
+    if (SkScalarNearlyZero(area)) {
+        SkPoint avg;
+        avg.set(0, 0);
+        for (int i = 0; i < count; ++i) {
+            const SkPoint& pt = segments[i].endPt();
+            avg.fX += pt.fX;
+            avg.fY += pt.fY;
+        }
+        SkScalar denom = SK_Scalar1 / count;
+        avg.scale(denom);
+        *c = avg;
+    } else {
+        area *= 3;  // centroid = weighted sum / (3 * sum of cross products)
+        area = GrScalarDiv(GR_Scalar1, area);
+        center.fX = GrScalarMul(center.fX, area);
+        center.fY = GrScalarMul(center.fY, area);
+        // undo the translate of p0 to the origin.
+        *c = center + p0;
+    }
+    GrAssert(!SkScalarIsNaN(c->fX) && !SkScalarIsNaN(c->fY));
+}
+
+void compute_vectors(SegmentArray* segments,
+                     SkPoint* fanPt,
+                     SkPath::Direction dir,
+                     int* vCount,
+                     int* iCount) {
+    center_of_mass(*segments, fanPt);
+    int count = segments->count();
+
+    // Make the normals point towards the outside
+    GrPoint::Side normSide;
+    if (dir == SkPath::kCCW_Direction) {
+        normSide = GrPoint::kRight_Side;
+    } else {
+        normSide = GrPoint::kLeft_Side;
+    }
+
+    *vCount = 0;
+    *iCount = 0;
+    // compute normals at all points
+    for (int a = 0; a < count; ++a) {
+        const Segment& sega = (*segments)[a];
+        int b = (a + 1) % count;
+        Segment& segb = (*segments)[b];
+
+        const GrPoint* prevPt = &sega.endPt();
+        int n = segb.countPoints();
+        for (int p = 0; p < n; ++p) {
+            segb.fNorms[p] = segb.fPts[p] - *prevPt;
+            segb.fNorms[p].normalize();
+            segb.fNorms[p].setOrthog(segb.fNorms[p], normSide);
+            prevPt = &segb.fPts[p];
+        }
+        if (Segment::kLine == segb.fType) {
+            *vCount += 5;
+            *iCount += 9;
+        } else {
+            *vCount += 6;
+            *iCount += 12;
+        }
+    }
+
+    // compute mid-vectors where segments meet. TODO: Detect shallow corners
+    // and leave out the wedges and close gaps by stitching segments together.
+    for (int a = 0; a < count; ++a) {
+        const Segment& sega = (*segments)[a];
+        int b = (a + 1) % count;
+        Segment& segb = (*segments)[b];
+        segb.fMid = segb.fNorms[0] + sega.endNorm();
+        segb.fMid.normalize();
+        // corner wedges
+        *vCount += 4;
+        *iCount += 6;
+    }
+}
+
+struct DegenerateTestData {
+    DegenerateTestData() { fStage = kInitial; }
+    bool isDegenerate() const { return kNonDegenerate != fStage; }
+    enum {
+        kInitial,
+        kPoint,
+        kLine,
+        kNonDegenerate
+    }           fStage;
+    GrPoint     fFirstPoint;
+    GrVec       fLineNormal;
+    GrScalar    fLineC;
+};
+
+void update_degenerate_test(DegenerateTestData* data, const GrPoint& pt) {
+    static const SkScalar TOL = (SK_Scalar1 / 16);  // distance tolerance
+    static const SkScalar TOL_SQD = SkScalarMul(TOL, TOL);
+    // Feeds one point: kInitial -> kPoint -> kLine -> kNonDegenerate.
+    switch (data->fStage) {
+        case DegenerateTestData::kInitial:  // remember the first point
+            data->fFirstPoint = pt;
+            data->fStage = DegenerateTestData::kPoint;
+            break;
+        case DegenerateTestData::kPoint:  // wait for a point > TOL away
+            if (pt.distanceToSqd(data->fFirstPoint) > TOL_SQD) {
+                data->fLineNormal = pt - data->fFirstPoint;
+                data->fLineNormal.normalize();
+                data->fLineNormal.setOrthog(data->fLineNormal);
+                data->fLineC = -data->fLineNormal.dot(data->fFirstPoint);
+                data->fStage = DegenerateTestData::kLine;
+            }
+            break;
+        case DegenerateTestData::kLine:  // wait for a point > TOL off the line
+            if (SkScalarAbs(data->fLineNormal.dot(pt) + data->fLineC) > TOL) {
+                data->fStage = DegenerateTestData::kNonDegenerate;
+            }  // falls through; the next case only breaks
+        case DegenerateTestData::kNonDegenerate:
+            break;
+        default:
+            GrCrash("Unexpected degenerate test stage.");
+    }
+}
+
+bool get_segments(const GrPath& path,
+                 SegmentArray* segments,
+                 SkPoint* fanPt,
+                 int* vCount,
+                 int* iCount) {
+    SkPath::Iter iter(path, true);
+    // This renderer overemphasises very thin path regions. We use the distance
+    // to the path from the sample to compute coverage. Every pixel intersected
+    // by the path will be hit and the maximum distance is sqrt(2)/2. We don't
+    // notice that the sample may be close to a very thin area of the path and 
+    // thus should be very light. This is particularly egregious for degenerate
+    // line paths. We detect paths that are very close to a line (zero area) and
+    // draw nothing.
+    DegenerateTestData degenerateData;
+
+    for (;;) {
+        GrPoint pts[4];
+        GrPathCmd cmd = (GrPathCmd)iter.next(pts);
+        switch (cmd) {
+            case kMove_PathCmd:
+                update_degenerate_test(&degenerateData, pts[0]);
+                break;
+            case kLine_PathCmd: {
+                update_degenerate_test(&degenerateData, pts[1]);
+                segments->push_back();
+                segments->back().fType = Segment::kLine;
+                segments->back().fPts[0] = pts[1];
+                break;
+            }
+            case kQuadratic_PathCmd:
+                update_degenerate_test(&degenerateData, pts[1]);
+                update_degenerate_test(&degenerateData, pts[2]);
+                segments->push_back();
+                segments->back().fType = Segment::kQuad;
+                segments->back().fPts[0] = pts[1];
+                segments->back().fPts[1] = pts[2];
+                break;
+            case kCubic_PathCmd: {
+                update_degenerate_test(&degenerateData, pts[1]);
+                update_degenerate_test(&degenerateData, pts[2]);
+                update_degenerate_test(&degenerateData, pts[3]);
+                SkSTArray<15, SkPoint, true> quads;
+                GrPathUtils::convertCubicToQuads(pts, SK_Scalar1, &quads);
+                int count = quads.count();
+                for (int q = 0; q < count; q += 3) {
+                    segments->push_back();
+                    segments->back().fType = Segment::kQuad;
+                    segments->back().fPts[0] = quads[q + 1];
+                    segments->back().fPts[1] = quads[q + 2];
+                }
+                break;
+            };
+            case kEnd_PathCmd:
+                if (degenerateData.isDegenerate()) {
+                    return false;
+                } else {
+                    SkPath::Direction dir;
+                    GR_DEBUGCODE(bool succeeded = )
+                    path.cheapComputeDirection(&dir);
+                    GrAssert(succeeded);
+                    compute_vectors(segments, fanPt, dir, vCount, iCount);
+                    return true;
+                }
+            default:
+                break;
+        }
+    }
+}
+
+struct QuadVertex {
+    GrPoint  fPos;
+    GrPoint  fUV;
+    GrScalar fD0;
+    GrScalar fD1;
+};
+    
+void create_vertices(const SegmentArray&  segments,
+                     const SkPoint& fanPt,
+                     QuadVertex*    verts,
+                     uint16_t*      idxs) {
+    int v = 0;
+    int i = 0;
+
+    int count = segments.count();
+    for (int a = 0; a < count; ++a) {
+        const Segment& sega = segments[a];
+        int b = (a + 1) % count;
+        const Segment& segb = segments[b];
+        
+        // FIXME: These tris are inset in the 1 unit arc around the corner
+        verts[v + 0].fPos = sega.endPt();
+        verts[v + 1].fPos = verts[v + 0].fPos + sega.endNorm();
+        verts[v + 2].fPos = verts[v + 0].fPos + segb.fMid;
+        verts[v + 3].fPos = verts[v + 0].fPos + segb.fNorms[0];
+        verts[v + 0].fUV.set(0,0);
+        verts[v + 1].fUV.set(0,-SK_Scalar1);
+        verts[v + 2].fUV.set(0,-SK_Scalar1);
+        verts[v + 3].fUV.set(0,-SK_Scalar1);
+        verts[v + 0].fD0 = verts[v + 0].fD1 = -SK_Scalar1;
+        verts[v + 1].fD0 = verts[v + 1].fD1 = -SK_Scalar1;
+        verts[v + 2].fD0 = verts[v + 2].fD1 = -SK_Scalar1;
+        verts[v + 3].fD0 = verts[v + 3].fD1 = -SK_Scalar1;
+        
+        idxs[i + 0] = v + 0;
+        idxs[i + 1] = v + 2;
+        idxs[i + 2] = v + 1;
+        idxs[i + 3] = v + 0;
+        idxs[i + 4] = v + 3;
+        idxs[i + 5] = v + 2;
+        
+        v += 4;
+        i += 6;
+
+        if (Segment::kLine == segb.fType) {
+            verts[v + 0].fPos = fanPt;
+            verts[v + 1].fPos = sega.endPt();
+            verts[v + 2].fPos = segb.fPts[0];
+
+            verts[v + 3].fPos = verts[v + 1].fPos + segb.fNorms[0];
+            verts[v + 4].fPos = verts[v + 2].fPos + segb.fNorms[0];
+
+            // we draw the line edge as a degenerate quad (u is 0, v is the
+            // signed distance to the edge)
+            GrScalar dist = fanPt.distanceToLineBetween(verts[v + 1].fPos,
+                                                        verts[v + 2].fPos);
+            verts[v + 0].fUV.set(0, dist);
+            verts[v + 1].fUV.set(0, 0);
+            verts[v + 2].fUV.set(0, 0);
+            verts[v + 3].fUV.set(0, -SK_Scalar1);
+            verts[v + 4].fUV.set(0, -SK_Scalar1);
+
+            verts[v + 0].fD0 = verts[v + 0].fD1 = -SK_Scalar1;
+            verts[v + 1].fD0 = verts[v + 1].fD1 = -SK_Scalar1;
+            verts[v + 2].fD0 = verts[v + 2].fD1 = -SK_Scalar1;
+            verts[v + 3].fD0 = verts[v + 3].fD1 = -SK_Scalar1;
+            verts[v + 4].fD0 = verts[v + 4].fD1 = -SK_Scalar1;
+
+            idxs[i + 0] = v + 0;
+            idxs[i + 1] = v + 2;
+            idxs[i + 2] = v + 1;
+
+            idxs[i + 3] = v + 3;
+            idxs[i + 4] = v + 1;
+            idxs[i + 5] = v + 2;
+
+            idxs[i + 6] = v + 4;
+            idxs[i + 7] = v + 3;
+            idxs[i + 8] = v + 2;
+
+            v += 5;
+            i += 9;
+        } else {
+            GrPoint qpts[] = {sega.endPt(), segb.fPts[0], segb.fPts[1]};
+
+            GrVec midVec = segb.fNorms[0] + segb.fNorms[1];
+            midVec.normalize();
+
+            verts[v + 0].fPos = fanPt;
+            verts[v + 1].fPos = qpts[0];
+            verts[v + 2].fPos = qpts[2];
+            verts[v + 3].fPos = qpts[0] + segb.fNorms[0];
+            verts[v + 4].fPos = qpts[2] + segb.fNorms[1];
+            verts[v + 5].fPos = qpts[1] + midVec;
+
+            GrScalar c = segb.fNorms[0].dot(qpts[0]);
+            verts[v + 0].fD0 =  -segb.fNorms[0].dot(fanPt) + c;
+            verts[v + 1].fD0 =  0.f;
+            verts[v + 2].fD0 =  -segb.fNorms[0].dot(qpts[2]) + c;
+            verts[v + 3].fD0 = -GR_ScalarMax/100;
+            verts[v + 4].fD0 = -GR_ScalarMax/100;
+            verts[v + 5].fD0 = -GR_ScalarMax/100;
+
+            c = segb.fNorms[1].dot(qpts[2]);
+            verts[v + 0].fD1 =  -segb.fNorms[1].dot(fanPt) + c;
+            verts[v + 1].fD1 =  -segb.fNorms[1].dot(qpts[0]) + c;
+            verts[v + 2].fD1 =  0.f;
+            verts[v + 3].fD1 = -GR_ScalarMax/100;
+            verts[v + 4].fD1 = -GR_ScalarMax/100;
+            verts[v + 5].fD1 = -GR_ScalarMax/100;
+
+            GrMatrix toUV;
+            GrPathUtils::quadDesignSpaceToUVCoordsMatrix(qpts, &toUV);
+            toUV.mapPointsWithStride(&verts[v].fUV,
+                                     &verts[v].fPos,
+                                     sizeof(QuadVertex),
+                                     6);
+
+            idxs[i + 0] = v + 3;
+            idxs[i + 1] = v + 1;
+            idxs[i + 2] = v + 2;
+            idxs[i + 3] = v + 4;
+            idxs[i + 4] = v + 3;
+            idxs[i + 5] = v + 2;
+
+            idxs[i + 6] = v + 5;
+            idxs[i + 7] = v + 3;
+            idxs[i + 8] = v + 4;
+
+            idxs[i +  9] = v + 0;
+            idxs[i + 10] = v + 2;
+            idxs[i + 11] = v + 1;
+
+            v += 6;
+            i += 12;
+        }
+    }
+}
+
+}
+
+bool GrAAConvexPathRenderer::canDrawPath(const SkPath& path,
+                                         GrPathFill fill,
+                                         const GrDrawTarget* target,
+                                         bool antiAlias) const {
+    if (!target->getCaps().fShaderDerivativeSupport || !antiAlias ||
+        kHairLine_PathFill == fill || GrIsFillInverted(fill) ||
+        !path.isConvex()) {
+        return false;
+    }  else {
+        return true;
+    }
+}
+
+bool GrAAConvexPathRenderer::onDrawPath(const SkPath& origPath,
+                                        GrPathFill fill,
+                                        const GrVec* translate,
+                                        GrDrawTarget* target,
+                                        GrDrawState::StageMask stageMask,
+                                        bool antiAlias) {
+
+
+    if (origPath.isEmpty()) {
+        return true;
+    }
+    GrDrawState* drawState = target->drawState();
+
+    GrDrawTarget::AutoStateRestore asr;
+    GrMatrix vm = drawState->getViewMatrix();
+    if (NULL != translate) {
+        vm.postTranslate(translate->fX, translate->fY);
+    }
+    asr.set(target);
+    GrMatrix ivm;
+    if (vm.invert(&ivm)) {
+        drawState->preConcatSamplerMatrices(stageMask, ivm);
+    }
+    drawState->setViewMatrix(GrMatrix::I());
+
+    SkPath path;
+    origPath.transform(vm, &path);
+
+    GrVertexLayout layout = 0;
+    for (int s = 0; s < GrDrawState::kNumStages; ++s) {
+        if ((1 << s) & stageMask) {
+            layout |= GrDrawTarget::StagePosAsTexCoordVertexLayoutBit(s);
+        }
+    }
+    layout |= GrDrawTarget::kEdge_VertexLayoutBit;
+
+    QuadVertex *verts;
+    uint16_t* idxs;
+
+    int vCount;
+    int iCount;
+    SegmentArray segments;
+    SkPoint fanPt;
+    if (!get_segments(path, &segments, &fanPt, &vCount, &iCount)) {
+        return false;
+    }
+
+    if (!target->reserveVertexSpace(layout,
+                                    vCount,
+                                    reinterpret_cast<void**>(&verts))) {
+        return false;
+    }
+    if (!target->reserveIndexSpace(iCount, reinterpret_cast<void**>(&idxs))) {
+        target->resetVertexSource();
+        return false;
+    }
+
+    create_vertices(segments, fanPt, verts, idxs);
+
+    drawState->setVertexEdgeType(GrDrawState::kQuad_EdgeType);
+    target->drawIndexed(kTriangles_PrimitiveType,
+                        0,        // start vertex
+                        0,        // start index
+                        vCount,
+                        iCount);
+    return true;
+}
+
diff --git a/src/gpu/GrAAConvexPathRenderer.h b/src/gpu/GrAAConvexPathRenderer.h
new file mode 100644
index 0000000..df0c001
--- /dev/null
+++ b/src/gpu/GrAAConvexPathRenderer.h
@@ -0,0 +1,41 @@
+
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef GrAAConvexPathRenderer_DEFINED
+#define GrAAConvexPathRenderer_DEFINED
+
+#include "GrPathRenderer.h"
+
+/**
+ * GrPathRenderer subclass that GrPathRenderer::AddPathRenderers() appends to
+ * the chain whenever kNonAAOnly_UsageFlag is not requested.
+ * NOTE(review): going by the name it targets antialiased convex paths; the
+ * exact acceptance rules live in canDrawPath() in the .cpp - confirm there.
+ */
+class GrAAConvexPathRenderer : public GrPathRenderer {
+public:
+    GrAAConvexPathRenderer();
+
+    // Reports whether this renderer accepts the given path / fill / AA
+    // combination on the given draw target.
+    virtual bool canDrawPath(const SkPath& path,
+                             GrPathFill fill,
+                             const GrDrawTarget* target,
+                             bool antiAlias) const SK_OVERRIDE;
+protected:
+    // Draws the path into target. translate may be NULL. Returns false when
+    // the draw could not be issued, true otherwise.
+    virtual bool onDrawPath(const SkPath& path,
+                            GrPathFill fill,
+                            const GrVec* translate,
+                            GrDrawTarget* target,
+                            GrDrawState::StageMask stageMask,
+                            bool antiAlias) SK_OVERRIDE;
+};
+
+#endif
diff --git a/src/gpu/GrAAHairLinePathRenderer.cpp b/src/gpu/GrAAHairLinePathRenderer.cpp
index 29db9aa..3ed4488 100644
--- a/src/gpu/GrAAHairLinePathRenderer.cpp
+++ b/src/gpu/GrAAHairLinePathRenderer.cpp
@@ -99,7 +99,6 @@
     linesIndexBuffer->ref();
     fQuadsIndexBuffer = quadsIndexBuffer;
     quadsIndexBuffer->ref();
-    this->resetGeom();
 }
 
 GrAAHairLinePathRenderer::~GrAAHairLinePathRenderer() {
@@ -107,98 +106,12 @@
     fQuadsIndexBuffer->unref();
 }
 
-bool GrAAHairLinePathRenderer::canDrawPath(const GrDrawTarget::Caps& targetCaps,
-                                           const SkPath& path,
-                                           GrPathFill fill,
-                                           bool antiAlias) const {
-    static const uint32_t gReqDerivMask = SkPath::kCubic_SegmentMask |
-                                          SkPath::kQuad_SegmentMask;
-    return (kHairLine_PathFill == fill &&
-            antiAlias &&
-            (targetCaps.fShaderDerivativeSupport ||
-             !(gReqDerivMask & path.getSegmentMasks())));
-}
-
-void GrAAHairLinePathRenderer::pathWillClear() {
-    this->resetGeom();
-}
-
-void GrAAHairLinePathRenderer::resetGeom() {
-    fPreviousStages = ~0;
-    fPreviousRTHeight = ~0;
-    fPreviousViewMatrix = GrMatrix::InvalidMatrix();
-    fLineSegmentCnt = 0;
-    fQuadCnt = 0; 
-    if ((fQuadCnt || fLineSegmentCnt) && NULL != fTarget) {
-        fTarget->resetVertexSource();
-    }
-}
-
 namespace {
 
 typedef SkTArray<SkPoint, true> PtArray;
 #define PREALLOC_PTARRAY(N) SkSTArray<(N),SkPoint, true>
 typedef SkTArray<int, true> IntArray;
 
-/**
- * We convert cubics to quadratics (for now).
- */
-void convert_noninflect_cubic_to_quads(const SkPoint p[4],
-                                       SkScalar tolScale,
-                                       PtArray* quads,
-                                       int sublevel = 0) {
-    SkVector ab = p[1];
-    ab -= p[0];
-    SkVector dc = p[2];
-    dc -= p[3];
-
-    static const SkScalar gLengthScale = 3 * SK_Scalar1 / 2;
-    // base tolerance is 2 pixels in dev coords.
-    const SkScalar distanceSqdTol = SkScalarMul(tolScale, 2 * SK_Scalar1);
-    static const int kMaxSubdivs = 10;
-
-    ab.scale(gLengthScale);
-    dc.scale(gLengthScale);
-
-    SkVector c0 = p[0];
-    c0 += ab;
-    SkVector c1 = p[3];
-    c1 += dc;
-
-    SkScalar dSqd = c0.distanceToSqd(c1);
-    if (sublevel > kMaxSubdivs || dSqd <= distanceSqdTol) {
-        SkPoint cAvg = c0;
-        cAvg += c1;
-        cAvg.scale(SK_ScalarHalf);
-
-        SkPoint* pts = quads->push_back_n(3);
-        pts[0] = p[0];
-        pts[1] = cAvg;
-        pts[2] = p[3];
-
-        return;
-    } else {
-        SkPoint choppedPts[7];
-        SkChopCubicAtHalf(p, choppedPts);
-        convert_noninflect_cubic_to_quads(choppedPts + 0, tolScale, 
-                                          quads, sublevel + 1);
-        convert_noninflect_cubic_to_quads(choppedPts + 3, tolScale,
-                                          quads, sublevel + 1);
-    }
-}
-
-void convert_cubic_to_quads(const SkPoint p[4],
-                            SkScalar tolScale,
-                            PtArray* quads) {
-    SkPoint chopped[13];
-    int count = SkChopCubicAtInflections(p, chopped);
-
-    for (int i = 0; i < count; ++i) {
-        SkPoint* cubic = chopped + 3*i;
-        convert_noninflect_cubic_to_quads(cubic, tolScale, quads);
-    }
-}
-
 // Takes 178th time of logf on Z600 / VC2010
 int get_float_exp(float x) {
     GR_STATIC_ASSERT(sizeof(int) == sizeof(float));
@@ -350,14 +263,15 @@
                 bounds.roundOut(&ibounds);
                 if (SkIRect::Intersects(clip, ibounds)) {
                     PREALLOC_PTARRAY(32) q;
-                    // in perspective have to do conversion in src space
+                    // We convert cubics to quadratics (for now).
+                    // In perspective have to do conversion in src space.
                     if (persp) {
                         SkScalar tolScale = 
                             GrPathUtils::scaleToleranceToSrc(SK_Scalar1, m,
                                                              path.getBounds());
-                        convert_cubic_to_quads(pts, tolScale, &q);
+                        GrPathUtils::convertCubicToQuads(pts, tolScale, &q);
                     } else {
-                        convert_cubic_to_quads(devPts, SK_Scalar1, &q);
+                        GrPathUtils::convertCubicToQuads(devPts, SK_Scalar1, &q);
                     }
                     for (int i = 0; i < q.count(); i += 3) {
                         SkPoint* qInDevSpace;
@@ -447,24 +361,9 @@
     SkPoint b = qpts[1];
     SkPoint c = qpts[2];
 
-    // compute a matrix that goes from device coords to U,V quad params
     // this should be in the src space, not dev coords, when we have perspective
     SkMatrix DevToUV;
-    DevToUV.setAll(a.fX,           b.fX,          c.fX,
-                   a.fY,           b.fY,          c.fY,
-                   SK_Scalar1,     SK_Scalar1,    SK_Scalar1);
-    DevToUV.invert(&DevToUV);
-    // can't make this static, no cons :(
-    SkMatrix UVpts;
-    UVpts.setAll(0,                 SK_ScalarHalf,  SK_Scalar1,
-                 0,                 0,              SK_Scalar1,
-                 SK_Scalar1,        SK_Scalar1,     SK_Scalar1);
-    DevToUV.postConcat(UVpts);
-
-    // We really want to avoid perspective matrix muls.
-    // These may wind up really close to zero
-    DevToUV.setPerspX(0);
-    DevToUV.setPerspY(0);
+    GrPathUtils::quadDesignSpaceToUVCoordsMatrix(qpts, &DevToUV);
 
     if (toDevice) {
         toDevice->mapPoints(&a, 1);
@@ -601,28 +500,23 @@
 
 }
 
-bool GrAAHairLinePathRenderer::createGeom(GrDrawState::StageMask stageMask) {
-    const GrDrawState& drawState = fTarget->getDrawState();
+bool GrAAHairLinePathRenderer::createGeom(const SkPath& path,
+                                          const GrVec* translate,
+                                          GrDrawTarget* target,
+                                          GrDrawState::StageMask stageMask,
+                                          int* lineCnt,
+                                          int* quadCnt) {
+    const GrDrawState& drawState = target->getDrawState();
     int rtHeight = drawState.getRenderTarget()->height();
 
     GrIRect clip;
-    if (fTarget->getClip().hasConservativeBounds()) {
-        GrRect clipRect =  fTarget->getClip().getConservativeBounds();
+    if (target->getClip().hasConservativeBounds()) {
+        GrRect clipRect =  target->getClip().getConservativeBounds();
         clipRect.roundOut(&clip);
     } else {
         clip.setLargest();
     }
 
-    // If none of the inputs that affect generation of path geometry have
-    // have changed since last previous path draw then we can reuse the
-    // previous geoemtry.
-    if (stageMask == fPreviousStages &&
-        fPreviousViewMatrix == drawState.getViewMatrix() &&
-        fPreviousTranslate == fTranslate &&
-        rtHeight == fPreviousRTHeight &&
-        fClipRect == clip) {
-        return true;
-    }
 
     GrVertexLayout layout = GrDrawTarget::kEdge_VertexLayoutBit;
     for (int s = 0; s < GrDrawState::kNumStages; ++s) {
@@ -636,19 +530,22 @@
     PREALLOC_PTARRAY(128) lines;
     PREALLOC_PTARRAY(128) quads;
     IntArray qSubdivs;
-    fQuadCnt = generate_lines_and_quads(*fPath, viewM, fTranslate, clip,
+    static const GrVec gZeroVec = {0, 0};
+    if (NULL == translate) {
+        translate = &gZeroVec;
+    }
+    *quadCnt = generate_lines_and_quads(path, viewM, *translate, clip,
                                         &lines, &quads, &qSubdivs);
 
-    fLineSegmentCnt = lines.count() / 2;
-    int vertCnt = kVertsPerLineSeg * fLineSegmentCnt + kVertsPerQuad * fQuadCnt;
+    *lineCnt = lines.count() / 2;
+    int vertCnt = kVertsPerLineSeg * *lineCnt + kVertsPerQuad * *quadCnt;
 
     GrAssert(sizeof(Vertex) == GrDrawTarget::VertexSize(layout));
 
     Vertex* verts;
-    if (!fTarget->reserveVertexSpace(layout, vertCnt, (void**)&verts)) {
+    if (!target->reserveVertexSpace(layout, vertCnt, (void**)&verts)) {
         return false;
     }
-    Vertex* base = verts;
 
     const GrMatrix* toDevice = NULL;
     const GrMatrix* toSrc = NULL;
@@ -661,7 +558,7 @@
         }
     }
 
-    for (int i = 0; i < fLineSegmentCnt; ++i) {
+    for (int i = 0; i < *lineCnt; ++i) {
         add_line(&lines[2*i], rtHeight, toSrc, &verts);
     }
 
@@ -671,25 +568,50 @@
         add_quads(&quads[3*i], qSubdivs[i], toDevice, toSrc, &verts);
     }
 
-    fPreviousStages = stageMask;
-    fPreviousViewMatrix = drawState.getViewMatrix();
-    fPreviousRTHeight = rtHeight;
-    fClipRect = clip;
-    fPreviousTranslate = fTranslate;
     return true;
 }
 
-void GrAAHairLinePathRenderer::drawPath(GrDrawState::StageMask stageMask) {
-
-    if (!this->createGeom(stageMask)) {
-        return;
+bool GrAAHairLinePathRenderer::canDrawPath(const SkPath& path,
+                                           GrPathFill fill,
+                                           const GrDrawTarget* target,
+                                           bool antiAlias) const {
+    if (fill != kHairLine_PathFill || !antiAlias) {
+        return false;
     }
 
-    GrDrawState* drawState = fTarget->drawState();
+    static const uint32_t gReqDerivMask = SkPath::kCubic_SegmentMask |
+                                          SkPath::kQuad_SegmentMask;
+    if (!target->getCaps().fShaderDerivativeSupport &&
+        (gReqDerivMask & path.getSegmentMasks())) {
+        return false;
+    }
+    return true;
+}
+
+bool GrAAHairLinePathRenderer::onDrawPath(const SkPath& path,
+                                          GrPathFill fill,
+                                          const GrVec* translate,
+                                          GrDrawTarget* target,
+                                          GrDrawState::StageMask stageMask,
+                                          bool antiAlias) {
+
+    int lineCnt;
+    int quadCnt;
+
+    if (!this->createGeom(path,
+                          translate,
+                          target,
+                          stageMask,
+                          &lineCnt,
+                          &quadCnt)) {
+        return false;
+    }
+
+    GrDrawState* drawState = target->drawState();
 
     GrDrawTarget::AutoStateRestore asr;
     if (!drawState->getViewMatrix().hasPerspective()) {
-        asr.set(fTarget);
+        asr.set(target);
         GrMatrix ivm;
         if (drawState->getViewInverse(&ivm)) {
             drawState->preConcatSamplerMatrices(stageMask, ivm);
@@ -699,32 +621,32 @@
 
     // TODO: See whether rendering lines as degenerate quads improves perf
     // when we have a mix
-    fTarget->setIndexSourceToBuffer(fLinesIndexBuffer);
+    target->setIndexSourceToBuffer(fLinesIndexBuffer);
     int lines = 0;
     int nBufLines = fLinesIndexBuffer->maxQuads();
-    while (lines < fLineSegmentCnt) {
-        int n = GrMin(fLineSegmentCnt-lines, nBufLines);
+    while (lines < lineCnt) {
+        int n = GrMin(lineCnt - lines, nBufLines);
         drawState->setVertexEdgeType(GrDrawState::kHairLine_EdgeType);
-        fTarget->drawIndexed(kTriangles_PrimitiveType,
-                             kVertsPerLineSeg*lines,    // startV
-                             0,                         // startI
-                             kVertsPerLineSeg*n,        // vCount
-                             kIdxsPerLineSeg*n);        // iCount
+        target->drawIndexed(kTriangles_PrimitiveType,
+                            kVertsPerLineSeg*lines,    // startV
+                            0,                         // startI
+                            kVertsPerLineSeg*n,        // vCount
+                            kIdxsPerLineSeg*n);        // iCount
         lines += n;
     }
 
-    fTarget->setIndexSourceToBuffer(fQuadsIndexBuffer);
+    target->setIndexSourceToBuffer(fQuadsIndexBuffer);
     int quads = 0;
-    while (quads < fQuadCnt) {
-        int n = GrMin(fQuadCnt-quads, kNumQuadsInIdxBuffer);
+    while (quads < quadCnt) {
+        int n = GrMin(quadCnt - quads, kNumQuadsInIdxBuffer);
         drawState->setVertexEdgeType(GrDrawState::kHairQuad_EdgeType);
-        fTarget->drawIndexed(kTriangles_PrimitiveType,
-                             4*fLineSegmentCnt + kVertsPerQuad*quads, // startV
-                             0,                                       // startI
-                             kVertsPerQuad*n,                         // vCount
-                             kIdxsPerQuad*n);                         // iCount
+        target->drawIndexed(kTriangles_PrimitiveType,
+                            4 * lineCnt + kVertsPerQuad*quads, // startV
+                            0,                                 // startI
+                            kVertsPerQuad*n,                   // vCount
+                            kIdxsPerQuad*n);                   // iCount
         quads += n;
     }
-
+    return true;
 }
 
diff --git a/src/gpu/GrAAHairLinePathRenderer.h b/src/gpu/GrAAHairLinePathRenderer.h
index 3b29919..33b7332 100644
--- a/src/gpu/GrAAHairLinePathRenderer.h
+++ b/src/gpu/GrAAHairLinePathRenderer.h
@@ -16,41 +16,35 @@
     virtual ~GrAAHairLinePathRenderer();
 
     static GrPathRenderer* Create(GrContext* context);
-    // GrPathRenderer overrides
-    virtual bool canDrawPath(const GrDrawTarget::Caps& targetCaps,
-                             const SkPath& path,
-                             GrPathFill fill,
-                             bool antiAlias) const  SK_OVERRIDE;
-    virtual void drawPath(GrDrawState::StageMask stages) SK_OVERRIDE;
 
+    virtual bool canDrawPath(const SkPath& path,
+                            GrPathFill fill,
+                            const GrDrawTarget* target,
+                            bool antiAlias) const SK_OVERRIDE;
 protected:
-
-    // GrPathRenderer overrides
-    virtual void pathWillClear()  SK_OVERRIDE;
-
+    virtual bool onDrawPath(const SkPath& path,
+                            GrPathFill fill,
+                            const GrVec* translate,
+                            GrDrawTarget* target,
+                            GrDrawState::StageMask stageMask,
+                            bool antiAlias) SK_OVERRIDE;
+ 
 private:
-    void resetGeom();
 
     GrAAHairLinePathRenderer(const GrContext* context,
                              const GrIndexBuffer* fLinesIndexBuffer,
                              const GrIndexBuffer* fQuadsIndexBuffer);
 
-    bool createGeom(GrDrawState::StageMask stages);
+    bool createGeom(const SkPath& path,
+                    const GrVec* translate,
+                    GrDrawTarget* target,
+                    GrDrawState::StageMask stageMask,
+                    int* lineCnt,
+                    int* quadCnt);
 
     const GrIndexBuffer*        fLinesIndexBuffer;
     const GrIndexBuffer*        fQuadsIndexBuffer;
 
-    // have to recreate geometry if stages in use changes :(
-    GrDrawState::StageMask      fPreviousStages;
-    int                         fPreviousRTHeight;
-    SkVector                    fPreviousTranslate;
-    GrIRect                     fClipRect;
-
-    // this path renderer draws everything in device coordinates
-    GrMatrix                    fPreviousViewMatrix;
-    int                         fLineSegmentCnt;
-    int                         fQuadCnt;
-
     typedef GrPathRenderer INHERITED;
 };
 
diff --git a/src/gpu/GrAddPathRenderers_aahairline.cpp b/src/gpu/GrAddPathRenderers_default.cpp
similarity index 79%
rename from src/gpu/GrAddPathRenderers_aahairline.cpp
rename to src/gpu/GrAddPathRenderers_default.cpp
index a7df66e..8f6eb1e 100644
--- a/src/gpu/GrAddPathRenderers_aahairline.cpp
+++ b/src/gpu/GrAddPathRenderers_default.cpp
@@ -1,20 +1,23 @@
 
 /*
- * Copyright 2011 Google Inc.
+ * Copyright 2012 Google Inc.
  *
  * Use of this source code is governed by a BSD-style license that can be
  * found in the LICENSE file.
  */
 
-
+ 
 #include "GrAAHairLinePathRenderer.h"
+#include "GrAAConvexPathRenderer.h"
 
 void GrPathRenderer::AddPathRenderers(GrContext* ctx,
                                       GrPathRendererChain::UsageFlags flags,
                                       GrPathRendererChain* chain) {
     if (!(GrPathRendererChain::kNonAAOnly_UsageFlag & flags)) {
+        // kNonAAOnly_UsageFlag is clear, so the AA renderers may join the chain.
         if (GrPathRenderer* pr = GrAAHairLinePathRenderer::Create(ctx)) {
             chain->addPathRenderer(pr)->unref();
         }
+        chain->addPathRenderer(new GrAAConvexPathRenderer())->unref();
     }
 }
diff --git a/src/gpu/GrAtlas.cpp b/src/gpu/GrAtlas.cpp
index 17464ba..9553cf2 100644
--- a/src/gpu/GrAtlas.cpp
+++ b/src/gpu/GrAtlas.cpp
@@ -179,10 +179,10 @@
     if (NULL == fTexture[format]) {
         GrTextureDesc desc = {
             kDynamicUpdate_GrTextureFlagBit,
-            kNone_GrAALevel,
             GR_ATLAS_TEXTURE_WIDTH,
             GR_ATLAS_TEXTURE_HEIGHT,
-            maskformat2pixelconfig(format)
+            maskformat2pixelconfig(format),
+            {0} // samples
         };
         fTexture[format] = fGpu->createTexture(desc, NULL, 0);
         if (NULL == fTexture[format]) {
diff --git a/src/gpu/GrBufferAllocPool.cpp b/src/gpu/GrBufferAllocPool.cpp
index d029471..8bed75f 100644
--- a/src/gpu/GrBufferAllocPool.cpp
+++ b/src/gpu/GrBufferAllocPool.cpp
@@ -40,14 +40,13 @@
     fMinBlockSize = GrMax(GrBufferAllocPool_MIN_BLOCK_SIZE, blockSize);
 
     fBytesInUse = 0;
-            
+
     fPreallocBuffersInUse = 0;
-    fFirstPreallocBuffer = 0;
+    fPreallocBufferStartIdx = 0;
     for (int i = 0; i < preallocBufferCnt; ++i) {
         GrGeometryBuffer* buffer = this->createBuffer(fMinBlockSize);
         if (NULL != buffer) {
             *fPreallocBuffers.append() = buffer;
-            buffer->ref();
         }
     }
 }
@@ -83,13 +82,16 @@
             buffer->unlock();
         }
     }
+    // fPreallocBuffersInUse will be decremented down to zero in the while loop
+    int preallocBuffersInUse = fPreallocBuffersInUse;
     while (!fBlocks.empty()) {
-        destroyBlock();
+        this->destroyBlock();
     }
     if (fPreallocBuffers.count()) {
         // must set this after above loop.
-        fFirstPreallocBuffer = (fFirstPreallocBuffer + fPreallocBuffersInUse) %
-                               fPreallocBuffers.count();
+        fPreallocBufferStartIdx = (fPreallocBufferStartIdx +
+                                   preallocBuffersInUse) %
+                                  fPreallocBuffers.count();
     }
     // we may have created a large cpu mirror of a large VB. Reset the size
     // to match our pre-allocated VBs.
@@ -217,6 +219,12 @@
 void GrBufferAllocPool::putBack(size_t bytes) {
     VALIDATE();
 
+    // if the putBack unwinds all the preallocated buffers then we will
+    // advance the starting index. As blocks are destroyed fPreallocBuffersInUse
+    // will be decremented. It will reach zero if all blocks using preallocated
+    // buffers are released.
+    int preallocBuffersInUse = fPreallocBuffersInUse;
+
     while (bytes) {
         // caller shouldnt try to put back more than they've taken
         GrAssert(!fBlocks.empty());
@@ -238,6 +246,11 @@
             break;
         }
     }
+    if (!fPreallocBuffersInUse && fPreallocBuffers.count()) {
+            fPreallocBufferStartIdx = (fPreallocBufferStartIdx +
+                                       preallocBuffersInUse) %
+                                      fPreallocBuffers.count();
+    }
     VALIDATE();
 }
 
@@ -253,8 +266,9 @@
     if (size == fMinBlockSize &&
         fPreallocBuffersInUse < fPreallocBuffers.count()) {
 
-        uint32_t nextBuffer = (fPreallocBuffersInUse + fFirstPreallocBuffer) %
-                               fPreallocBuffers.count();
+        uint32_t nextBuffer = (fPreallocBuffersInUse +
+                               fPreallocBufferStartIdx) %
+                              fPreallocBuffers.count();
         block.fBuffer = fPreallocBuffers[nextBuffer];
         block.fBuffer->ref();
         ++fPreallocBuffersInUse;
@@ -302,7 +316,7 @@
     BufferBlock& block = fBlocks.back();
     if (fPreallocBuffersInUse > 0) {
         uint32_t prevPreallocBuffer = (fPreallocBuffersInUse +
-                                       fFirstPreallocBuffer +
+                                       fPreallocBufferStartIdx +
                                        (fPreallocBuffers.count() - 1)) %
                                       fPreallocBuffers.count();
         if (block.fBuffer == fPreallocBuffers[prevPreallocBuffer]) {
diff --git a/src/gpu/GrBufferAllocPool.h b/src/gpu/GrBufferAllocPool.h
index acf0289..3e2cd39 100644
--- a/src/gpu/GrBufferAllocPool.h
+++ b/src/gpu/GrBufferAllocPool.h
@@ -176,7 +176,9 @@
 
     SkTArray<BufferBlock>           fBlocks;
     int                             fPreallocBuffersInUse;
-    int                             fFirstPreallocBuffer;
+    // We attempt to cycle through the preallocated buffers rather than
+    // always starting from the first.
+    int                             fPreallocBufferStartIdx;
     SkAutoMalloc                    fCpuData;
     void*                           fBufferPtr;
 };
diff --git a/src/gpu/GrContext.cpp b/src/gpu/GrContext.cpp
index 1767f06..2a12399 100644
--- a/src/gpu/GrContext.cpp
+++ b/src/gpu/GrContext.cpp
@@ -22,10 +22,6 @@
 #include "SkTLazy.h"
 #include "SkTrace.h"
 
-// Using MSAA seems to be slower for some yet unknown reason.
-#define PREFER_MSAA_OFFSCREEN_AA 0
-#define OFFSCREEN_SSAA_SCALE 4 // super sample at 4x4
-
 #define DEFER_TEXT_RENDERING 1
 
 #define BATCH_RECT_TO_RECT (1 && !GR_STATIC_RECT_VB)
@@ -115,6 +111,10 @@
     GrSafeSetNull(fPathRendererChain);
 }
 
+size_t GrContext::getGpuTextureCacheBytes() const {
+    return fTextureCache->getCachedResourceBytes();
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 int GrContext::PaintStageVertexLayoutBits(
@@ -163,6 +163,7 @@
                             GrContext::TextureKey clientKey,
                             int width,
                             int height,
+                            int sampleCnt,
                             bool scratch,
                             uint32_t v[4]) {
     GR_STATIC_ASSERT(sizeof(GrContext::TextureKey) == sizeof(uint64_t));
@@ -174,7 +175,9 @@
     v[1] = (clientKey >> 32) & 0xffffffffUL;
     v[2] = width | (height << 16);
 
-    v[3] = 0;
+    v[3] = (sampleCnt << 24);
+    GrAssert(sampleCnt >= 0 && sampleCnt < 256);
+
     if (!gpu->getCaps().fNPOTTextureTileSupport) {
         bool isPow2 = GrIsPow2(width) && GrIsPow2(height);
 
@@ -215,23 +218,6 @@
                            sb->numSamples(), v);
 }
 
-// This should be subsumed by a future version of GrDrawState
-// It does not reset stage textures/samplers or per-vertex-edge-aa state since
-// they aren't used unless the vertex layout references them.
-// It also doesn't set the render target.
-void reset_draw_state(GrDrawState* drawState){
-
-        drawState->setViewMatrix(GrMatrix::I());
-        drawState->setColorFilter(0, SkXfermode::kDst_Mode);
-        drawState->resetStateFlags();
-        drawState->setEdgeAAData(NULL, 0);
-        drawState->disableStencil();
-        drawState->setAlpha(0xFF);
-        drawState->setBlendFunc(kOne_BlendCoeff,
-                           kZero_BlendCoeff);
-        drawState->setFirstCoverageStage(GrDrawState::kNumStages);
-        drawState->setDrawFace(GrDrawState::kBoth_DrawFace);
-}
 }
 
 GrContext::TextureCacheEntry GrContext::findAndLockTexture(
@@ -240,7 +226,7 @@
         int height,
         const GrSamplerState* sampler) {
     uint32_t v[4];
-    gen_texture_key_values(fGpu, sampler, key, width, height, false, v);
+    gen_texture_key_values(fGpu, sampler, key, width, height, 0, false, v);
     GrResourceKey resourceKey(v);
     return TextureCacheEntry(fTextureCache->findAndLock(resourceKey,
                                             GrResourceCache::kNested_LockType));
@@ -251,7 +237,7 @@
                                  int height,
                                  const GrSamplerState* sampler) const {
     uint32_t v[4];
-    gen_texture_key_values(fGpu, sampler, key, width, height, false, v);
+    gen_texture_key_values(fGpu, sampler, key, width, height, 0, false, v);
     GrResourceKey resourceKey(v);
     return fTextureCache->hasKey(resourceKey);
 }
@@ -326,7 +312,8 @@
     TextureCacheEntry entry;
     uint32_t v[4];
     bool special = gen_texture_key_values(fGpu, sampler, key,
-                                          desc.fWidth, desc.fHeight, false, v);
+                                          desc.fWidth, desc.fHeight,
+                                          desc.fSampleCnt, false, v);
     GrResourceKey resourceKey(v);
 
     if (special) {
@@ -356,7 +343,7 @@
         if (NULL != texture) {
             GrDrawTarget::AutoStateRestore asr(fGpu);
             GrDrawState* drawState = fGpu->drawState();
-            reset_draw_state(drawState);
+            drawState->reset();
             drawState->setRenderTarget(texture->asRenderTarget());
             drawState->setTexture(0, clampEntry.texture());
 
@@ -430,13 +417,12 @@
                                        uint32_t v[4]) {
     // Instead of a client-provided key of the texture contents
     // we create a key of from the descriptor.
-    GrContext::TextureKey descKey = desc.fAALevel |
-                                    (desc.fFlags << 8) |
+    GrContext::TextureKey descKey = (desc.fFlags << 8) |
                                     ((uint64_t) desc.fConfig << 32);
     // this code path isn't friendly to tiling with NPOT restricitons
     // We just pass ClampNoFilter()
     gen_texture_key_values(gpu, NULL, descKey, desc.fWidth,
-                           desc.fHeight, true, v);
+                           desc.fHeight, desc.fSampleCnt, true, v);
 }
 }
 
@@ -452,9 +438,6 @@
         desc.fHeight = GrMax(MIN_SIZE, GrNextPow2(desc.fHeight));
     }
 
-    uint32_t p0 = desc.fConfig;
-    uint32_t p1 = (desc.fAALevel << 16) | desc.fFlags;
-    
     GrResourceEntry* entry;
     int origWidth = desc.fWidth;
     int origHeight = desc.fHeight;
@@ -558,24 +541,6 @@
     return fGpu->createPlatformRenderTarget(desc);
 }
 
-GrResource* GrContext::createPlatformSurface(const GrPlatformSurfaceDesc& desc) {
-    // validate flags here so that GrGpu subclasses don't have to check
-    if (kTexture_GrPlatformSurfaceType == desc.fSurfaceType &&
-        0 != desc.fRenderTargetFlags) {
-        return NULL;
-    }
-    if (desc.fSampleCnt &&
-        (kGrCanResolve_GrPlatformRenderTargetFlagBit & desc.fRenderTargetFlags)) {
-        return NULL;
-    }
-    if (kTextureRenderTarget_GrPlatformSurfaceType == desc.fSurfaceType &&
-        desc.fSampleCnt &&
-        !(kGrCanResolve_GrPlatformRenderTargetFlagBit & desc.fRenderTargetFlags)) {
-        return NULL;
-    }
-    return fGpu->createPlatformSurface(desc);
-}
-
 ///////////////////////////////////////////////////////////////////////////////
 
 bool GrContext::supportsIndex8PixelConfig(const GrSamplerState* sampler,
@@ -674,241 +639,6 @@
 }
 }
 
-struct GrContext::OffscreenRecord {
-    enum Downsample {
-        k4x4SinglePass_Downsample,
-        kFSAA_Downsample
-    }                              fDownsample;
-    int                            fTileSizeX;
-    int                            fTileSizeY;
-    int                            fTileCountX;
-    int                            fTileCountY;
-    int                            fScale;
-    GrAutoScratchTexture           fOffscreen;
-    GrDrawTarget::SavedDrawState   fSavedState;
-    GrClip                         fClip;
-};
-
-bool GrContext::doOffscreenAA(GrDrawTarget* target,
-                              bool isHairLines) const {
-#if !GR_USE_OFFSCREEN_AA
-    return false;
-#else
-    // Line primitves are always rasterized as 1 pixel wide.
-    // Super-sampling would make them too thin but MSAA would be OK.
-    if (isHairLines &&
-        (!PREFER_MSAA_OFFSCREEN_AA || !fGpu->getCaps().fFSAASupport)) {
-        return false;
-    }
-    if (target->getDrawState().getRenderTarget()->isMultisampled()) {
-        return false;
-    }
-    if (disable_coverage_aa_for_blend(target)) {
-#if GR_DEBUG
-        //GrPrintf("Turning off AA to correctly apply blend.\n");
-#endif
-        return false;
-    }
-    return true;
-#endif
-}
-
-bool GrContext::prepareForOffscreenAA(GrDrawTarget* target,
-                                      bool requireStencil,
-                                      const GrIRect& boundRect,
-                                      GrPathRenderer* pr,
-                                      OffscreenRecord* record) {
-
-    GrAssert(GR_USE_OFFSCREEN_AA);
-
-    GrAssert(NULL == record->fOffscreen.texture());
-    GrAssert(!boundRect.isEmpty());
-
-    int boundW = boundRect.width();
-    int boundH = boundRect.height();
-
-    GrTextureDesc desc;
-
-    desc.fWidth  = GrMin(fMaxOffscreenAASize, boundW);
-    desc.fHeight = GrMin(fMaxOffscreenAASize, boundH);
-
-    if (requireStencil) {
-        desc.fFlags = kRenderTarget_GrTextureFlagBit;
-    } else {
-        desc.fFlags = kRenderTarget_GrTextureFlagBit | 
-                      kNoStencil_GrTextureFlagBit;
-    }
-
-    desc.fConfig = kRGBA_8888_PM_GrPixelConfig;
-
-    if (PREFER_MSAA_OFFSCREEN_AA && fGpu->getCaps().fFSAASupport) {
-        record->fDownsample = OffscreenRecord::kFSAA_Downsample;
-        record->fScale = 1;
-        desc.fAALevel = kMed_GrAALevel;
-    } else {
-        record->fDownsample = OffscreenRecord::k4x4SinglePass_Downsample;
-        record->fScale = OFFSCREEN_SSAA_SCALE;
-        // both downsample paths assume this
-        GR_STATIC_ASSERT(4 == OFFSCREEN_SSAA_SCALE);
-        desc.fAALevel = kNone_GrAALevel;
-    }
-    
-    desc.fWidth *= record->fScale;
-    desc.fHeight *= record->fScale;
-    record->fOffscreen.set(this, desc);
-    if (NULL == record->fOffscreen.texture()) {
-        return false;
-    }
-    // the approximate lookup might have given us some slop space, might as well
-    // use it when computing the tiles size.
-    // these are scale values, will adjust after considering
-    // the possible second offscreen.
-    record->fTileSizeX = record->fOffscreen.texture()->width();
-    record->fTileSizeY = record->fOffscreen.texture()->height();
-
-    record->fTileSizeX /= record->fScale;
-    record->fTileSizeY /= record->fScale;
-
-    record->fTileCountX = GrIDivRoundUp(boundW, record->fTileSizeX);
-    record->fTileCountY = GrIDivRoundUp(boundH, record->fTileSizeY);
-
-    record->fClip = target->getClip();
-
-    target->saveCurrentDrawState(&record->fSavedState);
-    return true;
-}
-
-void GrContext::setupOffscreenAAPass1(GrDrawTarget* target,
-                                      const GrIRect& boundRect,
-                                      int tileX, int tileY,
-                                      OffscreenRecord* record) {
-
-    GrRenderTarget* offRT = record->fOffscreen.texture()->asRenderTarget();
-    GrAssert(NULL != offRT);
-
-    GrPaint tempPaint;
-    tempPaint.reset();
-    this->setPaint(tempPaint, target);
-    GrDrawState* drawState = target->drawState();
-    drawState->setRenderTarget(offRT);
-#if PREFER_MSAA_OFFSCREEN_AA
-    drawState->enableState(GrDrawState::kHWAntialias_StateBit);
-#endif
-
-    GrMatrix transM;
-    int left = boundRect.fLeft + tileX * record->fTileSizeX;
-    int top =  boundRect.fTop  + tileY * record->fTileSizeY;
-    transM.setTranslate(-left * GR_Scalar1, -top * GR_Scalar1);
-    drawState->viewMatrix()->postConcat(transM);
-    GrMatrix scaleM;
-    scaleM.setScale(record->fScale * GR_Scalar1, record->fScale * GR_Scalar1);
-    drawState->viewMatrix()->postConcat(scaleM);
-
-    int w = (tileX == record->fTileCountX-1) ? boundRect.fRight - left :
-                                               record->fTileSizeX;
-    int h = (tileY == record->fTileCountY-1) ? boundRect.fBottom - top :
-                                               record->fTileSizeY;
-    GrIRect clear = SkIRect::MakeWH(record->fScale * w, 
-                                    record->fScale * h);
-    target->setClip(GrClip(clear));
-#if 0
-    // visualize tile boundaries by setting edges of offscreen to white
-    // and interior to tranparent. black.
-    target->clear(&clear, 0xffffffff);
-
-    static const int gOffset = 2;
-    GrIRect clear2 = SkIRect::MakeLTRB(gOffset, gOffset,
-                                       record->fScale * w - gOffset,
-                                       record->fScale * h - gOffset);
-    target->clear(&clear2, 0x0);
-#else
-    target->clear(&clear, 0x0);
-#endif
-}
-
-void GrContext::doOffscreenAAPass2(GrDrawTarget* target,
-                                 const GrPaint& paint,
-                                 const GrIRect& boundRect,
-                                 int tileX, int tileY,
-                                 OffscreenRecord* record) {
-    SK_TRACE_EVENT0("GrContext::doOffscreenAAPass2");
-    GrAssert(NULL != record->fOffscreen.texture());
-    GrDrawTarget::AutoGeometryPush agp(target);
-    GrIRect tileRect;
-    tileRect.fLeft = boundRect.fLeft + tileX * record->fTileSizeX;
-    tileRect.fTop  = boundRect.fTop  + tileY * record->fTileSizeY,
-    tileRect.fRight = (tileX == record->fTileCountX-1) ? 
-                        boundRect.fRight :
-                        tileRect.fLeft + record->fTileSizeX;
-    tileRect.fBottom = (tileY == record->fTileCountY-1) ? 
-                        boundRect.fBottom :
-                        tileRect.fTop + record->fTileSizeY;
-
-    GrSamplerState::Filter filter;
-    if (OffscreenRecord::k4x4SinglePass_Downsample == record->fDownsample) {
-        filter = GrSamplerState::k4x4Downsample_Filter;
-    } else {
-        filter = GrSamplerState::kBilinear_Filter;
-    }
-
-    GrTexture* src = record->fOffscreen.texture();
-    int scale;
-
-    enum {
-        kOffscreenStage = GrPaint::kTotalStages,
-    };
-
-    GrDrawState* drawState = target->drawState();
-
-    if (OffscreenRecord::kFSAA_Downsample == record->fDownsample) {
-        scale = 1;
-        GrIRect rect = SkIRect::MakeWH(tileRect.width(), tileRect.height());
-        src->asRenderTarget()->overrideResolveRect(rect);
-    } else {
-        GrAssert(OffscreenRecord::k4x4SinglePass_Downsample == 
-                 record->fDownsample);
-        scale = 4;
-    }
-
-    // setup for draw back to main RT, we use the original
-    // draw state setup by the caller plus an additional coverage
-    // stage to handle the AA resolve. Also, we use an identity
-    // view matrix and so pre-concat sampler matrices with view inv.
-    int stageMask = paint.getActiveStageMask();
-
-    target->restoreDrawState(record->fSavedState);
-    target->setClip(record->fClip);
-
-    if (stageMask) {
-        GrMatrix invVM;
-        if (drawState->getViewInverse(&invVM)) {
-            drawState->preConcatSamplerMatrices(stageMask, invVM);
-        }
-    }
-    // This is important when tiling, otherwise second tile's 
-    // pass 1 view matrix will be incorrect.
-    GrDrawState::AutoViewMatrixRestore avmr(drawState, GrMatrix::I());
-
-    drawState->setTexture(kOffscreenStage, src);
-    GrSamplerState* sampler = drawState->sampler(kOffscreenStage);
-    sampler->reset(GrSamplerState::kClamp_WrapMode, filter);
-    sampler->matrix()->setScale(scale * GR_Scalar1 / src->width(),
-                                scale * GR_Scalar1 / src->height());
-    sampler->matrix()->preTranslate(SkIntToScalar(-tileRect.fLeft),
-                                    SkIntToScalar(-tileRect.fTop));
-
-    GrRect dstRect;
-    int stages = (1 << kOffscreenStage) | stageMask;
-    dstRect.set(tileRect);
-    target->drawSimpleRect(dstRect, NULL, stages);
-}
-
-void GrContext::cleanupOffscreenAA(GrDrawTarget* target,
-                                   GrPathRenderer* pr,
-                                   OffscreenRecord* record) {
-    target->restoreDrawState(record->fSavedState);
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 
 /*  create a triangle strip that strokes the specified triangle. There are 8
@@ -1157,9 +887,9 @@
                              GrMatrix* combinedMatrix,
                              GrRect* devRect,
                              bool* useVertexCoverage) {
-    // we use a simple alpha ramp to do aa on axis-aligned rects
-    // do AA with alpha ramp if the caller requested AA, the rect 
-    // will be axis-aligned, and the rect won't land on integer coords.
+    // we use a simple coverage ramp to do aa on axis-aligned rects
+    // we check if the rect will be axis-aligned, and the rect won't land on 
+    // integer coords.
 
     // we are keeping around the "tweak the alpha" trick because
     // it is our only hope for the fixed-pipe implementation.
@@ -1167,18 +897,13 @@
     // TODO: remove this ugliness when we drop the fixed-pipe impl
     *useVertexCoverage = false;
     if (!target->canTweakAlphaForCoverage()) {
-        if (target->getCaps().fSupportPerVertexCoverage) {
-            if (disable_coverage_aa_for_blend(target)) {
+        if (disable_coverage_aa_for_blend(target)) {
 #if GR_DEBUG
-                //GrPrintf("Turning off AA to correctly apply blend.\n");
+            //GrPrintf("Turning off AA to correctly apply blend.\n");
 #endif
-                return false;
-            } else {
-                *useVertexCoverage = true;
-            }
-        } else {
-            GrPrintf("Rect AA dropped because no support for coverage.\n");
             return false;
+        } else {
+            *useVertexCoverage = true;
         }
     }
     const GrDrawState& drawState = target->getDrawState();
@@ -1460,6 +1185,173 @@
 }
 
 ///////////////////////////////////////////////////////////////////////////////
+#include "SkDraw.h"
+#include "SkRasterClip.h"
+
+namespace {
+
+// Maps a GrPathFill rule to the corresponding SkPath::FillType. Any value
+// other than the four fill rules tested below ends up in GrCrash().
+SkPath::FillType gr_fill_to_sk_fill(GrPathFill fill) {
+    if (kWinding_PathFill == fill) {
+        return SkPath::kWinding_FillType;
+    } else if (kEvenOdd_PathFill == fill) {
+        return SkPath::kEvenOdd_FillType;
+    } else if (kInverseWinding_PathFill == fill) {
+        return SkPath::kInverseWinding_FillType;
+    } else if (kInverseEvenOdd_PathFill == fill) {
+        return SkPath::kInverseEvenOdd_FillType;
+    }
+    GrCrash("Unexpected fill.");
+    return SkPath::kWinding_FillType;
+}
+
+// Computes two device-space integer rects for a path draw. On success
+// *clipBounds holds the rounded-out conservative clip (or the render target
+// extent when the clip has none) and *pathBounds holds the intersection of
+// the render target, that clip rect and the path's mapped bounds (the fill
+// rule is not considered). Returns false if there is no render target, the
+// path's own bounds are empty, or that intersection is empty.
+bool get_path_and_clip_bounds(const GrDrawTarget* target,
+                              const GrPath& path,
+                              const GrVec* translate,
+                              GrIRect* pathBounds,
+                              GrIRect* clipBounds) {
+    const GrDrawState& drawState = target->getDrawState();
+    const GrRenderTarget* rt = drawState.getRenderTarget();
+    if (NULL == rt) {
+        return false;
+    }
+    // start from the full rt extent and shrink it from there.
+    *pathBounds = GrIRect::MakeWH(rt->width(), rt->height());
+    const GrClip& clip = target->getClip();
+    if (!clip.hasConservativeBounds()) {
+        // no usable clip rect: the rt extent doubles as the clip bounds.
+        *clipBounds = *pathBounds;
+    } else {
+        clip.getConservativeBounds().roundOut(clipBounds);
+        if (!pathBounds->intersect(*clipBounds)) {
+            return false;
+        }
+    }
+    GrRect devPathRect = path.getBounds();
+    if (devPathRect.isEmpty()) {
+        return false;
+    }
+    // the translation is applied in path space, i.e. before the view matrix.
+    if (NULL != translate) {
+        devPathRect.offset(*translate);
+    }
+    drawState.getViewMatrix().mapRect(&devPathRect, devPathRect);
+    GrIRect devPathIRect;
+    devPathRect.roundOut(&devPathIRect);
+    return pathBounds->intersect(devPathIRect);
+}
+
+/**
+ * sw rasterizes path to an A8 mask covering pathDevBounds (device coords,
+ * using the context's matrix) and uploads it to the scratch texture in tex.
+ * Returns false if the bitmap or the scratch texture can't be allocated.
+ */
+bool sw_draw_path_to_mask_texture(const GrPath& clientPath,
+                                  const GrIRect& pathDevBounds,
+                                  GrPathFill fill,
+                                  GrContext* context,
+                                  const GrPoint* translate,
+                                  GrAutoScratchTexture* tex) {
+    SkPaint paint;
+    SkPath tmpPath;
+    const SkPath* pathToDraw = &clientPath;
+    if (kHairLine_PathFill == fill) {
+        paint.setStyle(SkPaint::kStroke_Style);
+        paint.setStrokeWidth(SK_Scalar1);
+    } else {
+        paint.setStyle(SkPaint::kFill_Style);
+        SkPath::FillType skfill = gr_fill_to_sk_fill(fill);
+        if (skfill != pathToDraw->getFillType()) {
+            tmpPath = *pathToDraw;
+            tmpPath.setFillType(skfill);
+            pathToDraw = &tmpPath;
+        }
+    }
+    paint.setAntiAlias(true);
+    paint.setColor(SK_ColorWHITE);
+    // translate is a path-space offset (get_path_and_clip_bounds offsets the
+    // path bounds before mapping them), so apply it ahead of the view matrix.
+    GrMatrix matrix = context->getMatrix();
+    if (NULL != translate) {
+        matrix.preTranslate(translate->fX, translate->fY);
+    }
+    matrix.postTranslate(-pathDevBounds.fLeft * SK_Scalar1,
+                         -pathDevBounds.fTop * SK_Scalar1);
+    GrIRect bounds = GrIRect::MakeWH(pathDevBounds.width(),
+                                     pathDevBounds.height());
+
+    SkBitmap bm;
+    bm.setConfig(SkBitmap::kA8_Config, bounds.fRight, bounds.fBottom);
+    if (!bm.allocPixels()) {
+        return false;
+    }
+    sk_bzero(bm.getPixels(), bm.getSafeSize());
+
+    SkDraw  draw;
+    sk_bzero(&draw, sizeof(draw));
+    SkRasterClip rc(bounds);
+    draw.fRC    = &rc;
+    draw.fClip  = &rc.bwRgn();
+    draw.fMatrix = &matrix;
+    draw.fBitmap = &bm;
+    draw.drawPath(*pathToDraw, paint);
+
+    const GrTextureDesc desc = {
+        kNone_GrTextureFlags,
+        bounds.fRight,
+        bounds.fBottom,
+        kAlpha_8_GrPixelConfig,
+        {0} // samples
+    };
+
+    tex->set(context, desc);
+    GrTexture* texture = tex->texture();
+
+    if (NULL == texture) {
+        return false;
+    }
+    SkAutoLockPixels alp(bm);
+    texture->writePixels(0, 0, desc.fWidth, desc.fHeight, desc.fConfig,
+                         bm.getPixels(), bm.rowBytes());
+    return true;
+}
+
+// Covers the part of clipBounds that lies outside pathBounds with up to four
+// rects (top, left, right, bottom), drawn under one AutoDeviceCoordDraw.
+void draw_around_inv_path(GrDrawTarget* target,
+                          GrDrawState::StageMask stageMask,
+                          const GrIRect& clipBounds,
+                          const GrIRect& pathBounds) {
+    GrDrawTarget::AutoDeviceCoordDraw adcd(target, stageMask);
+    const GrIRect& outer = clipBounds;
+    const GrIRect& inner = pathBounds;
+    GrRect strip;
+    if (outer.fTop < inner.fTop) {
+        strip.iset(outer.fLeft, outer.fTop, outer.fRight, inner.fTop);
+        target->drawSimpleRect(strip, NULL, stageMask);
+    }
+    if (outer.fLeft < inner.fLeft) {
+        strip.iset(outer.fLeft, inner.fTop, inner.fLeft, inner.fBottom);
+        target->drawSimpleRect(strip, NULL, stageMask);
+    }
+    if (outer.fRight > inner.fRight) {
+        strip.iset(inner.fRight, inner.fTop, outer.fRight, inner.fBottom);
+        target->drawSimpleRect(strip, NULL, stageMask);
+    }
+    if (outer.fBottom > inner.fBottom) {
+        strip.iset(outer.fLeft, inner.fBottom, outer.fRight, outer.fBottom);
+        target->drawSimpleRect(strip, NULL, stageMask);
+    }
+}
+
+}
 
 void GrContext::drawPath(const GrPaint& paint, const GrPath& path,
                          GrPathFill fill, const GrPoint* translate) {
@@ -1472,6 +1364,7 @@
     }
 
     GrDrawTarget* target = this->prepareToDraw(paint, kUnbuffered_DrawCategory);
+    GrDrawState::StageMask stageMask = paint.getActiveStageMask();
 
     bool prAA = paint.fAntiAlias && !this->getRenderTarget()->isMultisampled();
 
@@ -1486,17 +1379,50 @@
         prAA = false;
     }
 
-    bool doOSAA = false;
     GrPathRenderer* pr = NULL;
     if (prAA) {
-        pr = this->getPathRenderer(path, fill, true);
+        pr = this->getPathRenderer(path, fill, target, true);
         if (NULL == pr) {
-            prAA = false;
-            doOSAA = this->doOffscreenAA(target, kHairLine_PathFill == fill);
-            pr = this->getPathRenderer(path, fill, false);
+            GrAutoScratchTexture ast;
+            GrIRect pathBounds, clipBounds;
+            if (!get_path_and_clip_bounds(target, path, translate,
+                                          &pathBounds, &clipBounds)) {
+                return;
+            }
+            if (NULL == pr && sw_draw_path_to_mask_texture(path, pathBounds,
+                                                           fill, this,
+                                                           translate, &ast)) {
+                GrTexture* texture = ast.texture();
+                GrAssert(NULL != texture);
+                GrDrawTarget::AutoDeviceCoordDraw adcd(target, stageMask);
+                enum {
+                    kPathMaskStage = GrPaint::kTotalStages,
+                };
+                target->drawState()->setTexture(kPathMaskStage, texture);
+                target->drawState()->sampler(kPathMaskStage)->reset();
+                GrScalar w = GrIntToScalar(pathBounds.width());
+                GrScalar h = GrIntToScalar(pathBounds.height());
+                GrRect maskRect = GrRect::MakeWH(w / texture->width(),
+                                                 h / texture->height());
+                const GrRect* srcRects[GrDrawState::kNumStages] = {NULL};
+                srcRects[kPathMaskStage] = &maskRect;
+                stageMask |= 1 << kPathMaskStage;
+                GrRect dstRect = GrRect::MakeLTRB(
+                    SK_Scalar1* pathBounds.fLeft,
+                    SK_Scalar1* pathBounds.fTop,
+                    SK_Scalar1* pathBounds.fRight,
+                    SK_Scalar1* pathBounds.fBottom);
+                target->drawRect(dstRect, NULL, stageMask, srcRects, NULL);
+                target->drawState()->setTexture(kPathMaskStage, NULL);
+                if (GrIsFillInverted(fill)) {
+                    draw_around_inv_path(target, stageMask,
+                                         clipBounds, pathBounds);
+                }
+                return;
+            }
         }
     } else {
-        pr = this->getPathRenderer(path, fill, false);
+        pr = this->getPathRenderer(path, fill, target, false);
     }
 
     if (NULL == pr) {
@@ -1506,73 +1432,7 @@
         return;
     }
 
-    GrPathRenderer::AutoClearPath arp(pr, target, &path, fill, prAA, translate);
-    GrDrawState::StageMask stageMask = paint.getActiveStageMask();
-
-    if (doOSAA) {
-        bool needsStencil = pr->requiresStencilPass(target, path, fill);
-        const GrRenderTarget* rt = target->getDrawState().getRenderTarget();
-        // compute bounds as intersection of rt size, clip, and path
-        GrIRect bound = SkIRect::MakeWH(rt->width(), rt->height());
-        GrIRect clipIBounds;
-        if (target->getClip().hasConservativeBounds()) {
-            target->getClip().getConservativeBounds().roundOut(&clipIBounds);
-            if (!bound.intersect(clipIBounds)) {
-                return;
-            }
-        }
-        GrRect pathBounds = path.getBounds();
-        if (!pathBounds.isEmpty()) {
-            if (NULL != translate) {
-                pathBounds.offset(*translate);
-            }
-            target->getDrawState().getViewMatrix().mapRect(&pathBounds,
-                                                           pathBounds);
-            GrIRect pathIBounds;
-            pathBounds.roundOut(&pathIBounds);
-            if (!bound.intersect(pathIBounds)) {
-                return;
-            }
-        }
-        OffscreenRecord record;
-        if (this->prepareForOffscreenAA(target, needsStencil, bound,
-                                        pr, &record)) {
-            for (int tx = 0; tx < record.fTileCountX; ++tx) {
-                for (int ty = 0; ty < record.fTileCountY; ++ty) {
-                    this->setupOffscreenAAPass1(target, bound, tx, ty, &record);
-                    pr->drawPath(0);
-                    this->doOffscreenAAPass2(target, paint, bound, tx, ty, &record);
-                }
-            }
-            this->cleanupOffscreenAA(target, pr, &record);
-            if (GrIsFillInverted(fill) && bound != clipIBounds) {
-                GrDrawTarget::AutoDeviceCoordDraw adcd(target, stageMask);
-                GrRect rect;
-                if (clipIBounds.fTop < bound.fTop) {
-                    rect.iset(clipIBounds.fLeft, clipIBounds.fTop, 
-                              clipIBounds.fRight, bound.fTop);
-                    target->drawSimpleRect(rect, NULL, stageMask);
-                }
-                if (clipIBounds.fLeft < bound.fLeft) {
-                    rect.iset(clipIBounds.fLeft, bound.fTop, 
-                              bound.fLeft, bound.fBottom);
-                    target->drawSimpleRect(rect, NULL, stageMask);
-                }
-                if (clipIBounds.fRight > bound.fRight) {
-                    rect.iset(bound.fRight, bound.fTop, 
-                              clipIBounds.fRight, bound.fBottom);
-                    target->drawSimpleRect(rect, NULL, stageMask);
-                }
-                if (clipIBounds.fBottom > bound.fBottom) {
-                    rect.iset(clipIBounds.fLeft, bound.fBottom, 
-                              clipIBounds.fRight, clipIBounds.fBottom);
-                    target->drawSimpleRect(rect, NULL, stageMask);
-                }
-            }
-            return;
-        }
-    } 
-    pr->drawPath(stageMask);
+    pr->drawPath(path, fill, translate, target, stageMask, prAA);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -1648,6 +1508,35 @@
     }
 }
 
+#include "SkConfig8888.h"
+
+namespace {
+/**
+ * Looks up the SkCanvas::Config8888 matching a GrPixelConfig. Only the four
+ * byte-per-channel 8888 configs tested below have one; for those,
+ * *config8888 is written and true is returned. Any other config returns
+ * false and leaves *config8888 untouched.
+ */
+bool grconfig_to_config8888(GrPixelConfig config,
+                            SkCanvas::Config8888* config8888) {
+    SkCanvas::Config8888 match;
+    if (kRGBA_8888_PM_GrPixelConfig == config) {
+        match = SkCanvas::kRGBA_Premul_Config8888;
+    } else if (kRGBA_8888_UPM_GrPixelConfig == config) {
+        match = SkCanvas::kRGBA_Unpremul_Config8888;
+    } else if (kBGRA_8888_PM_GrPixelConfig == config) {
+        match = SkCanvas::kBGRA_Premul_Config8888;
+    } else if (kBGRA_8888_UPM_GrPixelConfig == config) {
+        match = SkCanvas::kBGRA_Unpremul_Config8888;
+    } else {
+        // no Config8888 equivalent for this config
+        return false;
+    }
+    *config8888 = match;
+    return true;
+}
+}
+
 bool GrContext::internalReadRenderTargetPixels(GrRenderTarget* target,
                                                int left, int top,
                                                int width, int height,
@@ -1664,19 +1553,34 @@
             return false;
         }
     }
-    
-    // PM <-> UPM conversion requires a draw. Currently we only support drawing
-    // into a UPM target, not reading from a UPM texture. Thus, UPM->PM is not
-    // not supported at this time.
-    if (GrPixelConfigIsUnpremultiplied(target->config()) && 
-        !GrPixelConfigIsUnpremultiplied(config)) {
-        return false;
-    }
 
     if (!(kDontFlush_PixelOpsFlag & flags)) {
         this->flush();
     }
 
+    if (!GrPixelConfigIsUnpremultiplied(target->config()) &&
+        GrPixelConfigIsUnpremultiplied(config) &&
+        !fGpu->canPreserveReadWriteUnpremulPixels()) {
+        SkCanvas::Config8888 srcConfig8888, dstConfig8888;
+        if (!grconfig_to_config8888(target->config(), &srcConfig8888) ||
+            !grconfig_to_config8888(config, &dstConfig8888)) {
+            return false;
+        }
+        // do read back using target's own config; bail out on failure so we
+        // neither convert an unfilled buffer nor report success.
+        if (!this->internalReadRenderTargetPixels(
+                target, left, top, width, height, target->config(),
+                buffer, rowBytes, kDontFlush_PixelOpsFlag)) {
+            return false;
+        }
+        // sw convert the pixels to unpremul config
+        uint32_t* pixels = reinterpret_cast<uint32_t*>(buffer);
+        SkConvertConfig8888Pixels(pixels, rowBytes, dstConfig8888,
+                                  pixels, rowBytes, srcConfig8888,
+                                  width, height);
+        return true;
+    }
+
     GrTexture* src = target->asTexture();
     bool swapRAndB = NULL != src &&
                      fGpu->preferredReadPixelsConfig(config) ==
@@ -1707,9 +1611,9 @@
         // readTexturePixels as of yet (it calls this function).
         const GrTextureDesc desc = {
             kRenderTarget_GrTextureFlagBit,
-            kNone_GrAALevel,
             width, height,
-            config
+            config,
+            {0}, // samples
         };
 
         // When a full readback is faster than a partial we could always make
@@ -1735,7 +1639,7 @@
 
         GrDrawTarget::AutoStateRestore asr(fGpu);
         GrDrawState* drawState = fGpu->drawState();
-        reset_draw_state(drawState);
+        drawState->reset();
         drawState->setRenderTarget(target);
 
         GrMatrix matrix;
@@ -1761,6 +1665,16 @@
                             config, buffer, rowBytes, flipY);
 }
 
+void GrContext::resolveRenderTarget(GrRenderTarget* target) {
+    GrAssert(target);
+    ASSERT_OWNED_RESOURCE(target);
+    // Flush, then have fGpu resolve target. Pending draws aren't tracked per
+    // target today, so we always flush first; clients are not promised this
+    // flush and should not rely on it.
+    this->flush();
+    fGpu->resolveRenderTarget(target);
+}
+
 void GrContext::copyTexture(GrTexture* src, GrRenderTarget* dst) {
     if (NULL == src || NULL == dst) {
         return;
@@ -1769,13 +1683,15 @@
 
     GrDrawTarget::AutoStateRestore asr(fGpu);
     GrDrawState* drawState = fGpu->drawState();
-    reset_draw_state(drawState);
+    drawState->reset();
     drawState->setRenderTarget(dst);
     GrMatrix sampleM;
     sampleM.setIDiv(src->width(), src->height());
     drawState->setTexture(0, src);
     drawState->sampler(0)->reset(sampleM);
-    SkRect rect = SkRect::MakeXYWH(0, 0, src->width(), src->height());
+    SkRect rect = SkRect::MakeXYWH(0, 0,
+                                   SK_Scalar1 * src->width(),
+                                   SK_Scalar1 * src->height());
     fGpu->drawSimpleRect(rect, NULL, 1 << 0);
 }
 
@@ -1817,6 +1733,28 @@
         return;
     }
 #endif
+    if (!GrPixelConfigIsUnpremultiplied(target->config()) &&
+        GrPixelConfigIsUnpremultiplied(config) &&
+        !fGpu->canPreserveReadWriteUnpremulPixels()) {
+        SkCanvas::Config8888 srcConfig8888, dstConfig8888;
+        if (!grconfig_to_config8888(config, &srcConfig8888) ||
+            !grconfig_to_config8888(target->config(), &dstConfig8888)) {
+            return;
+        }
+        // allocate a tmp buffer and sw convert the pixels to premul
+        SkAutoSTMalloc<128 * 128, uint32_t> tmpPixels(width * height);
+        const uint32_t* src = reinterpret_cast<const uint32_t*>(buffer);
+        SkConvertConfig8888Pixels(tmpPixels.get(), 4 * width, dstConfig8888,
+                                  src, rowBytes, srcConfig8888,
+                                  width, height);
+        // upload the already premul pixels
+        this->internalWriteRenderTargetPixels(target,
+                                             left, top,
+                                             width, height,
+                                             target->config(),
+                                             tmpPixels, 4 * width, flags);
+        return;
+    }
 
     bool swapRAndB = fGpu->preferredReadPixelsConfig(config) ==
                      GrPixelConfigSwapRAndB(config);
@@ -1825,7 +1763,7 @@
     }
 
     const GrTextureDesc desc = {
-        kNone_GrTextureFlags, kNone_GrAALevel, width, height, config
+        kNone_GrTextureFlags, width, height, config, {0}
     };
     GrAutoScratchTexture ast(this, desc);
     GrTexture* texture = ast.texture();
@@ -1837,7 +1775,7 @@
 
     GrDrawTarget::AutoStateRestore  asr(fGpu);
     GrDrawState* drawState = fGpu->drawState();
-    reset_draw_state(drawState);
+    drawState->reset();
 
     GrMatrix matrix;
     matrix.setTranslate(GrIntToScalar(left), GrIntToScalar(top));
@@ -1907,6 +1845,7 @@
     drawState->setBlendFunc(paint.fSrcBlendCoeff, paint.fDstBlendCoeff);
     drawState->setColorFilter(paint.fColorFilterColor, paint.fColorFilterXfermode);
     drawState->setColorMatrix(paint.fColorMatrix);
+    drawState->setCoverage(paint.fCoverage);
 
     if (paint.getActiveMaskStageMask() && !target->canApplyCoverage()) {
         GrPrintf("Partial pixel coverage will be incorrectly blended.\n");
@@ -1943,13 +1882,13 @@
 
 GrPathRenderer* GrContext::getPathRenderer(const GrPath& path,
                                            GrPathFill fill,
+                                           const GrDrawTarget* target,
                                            bool antiAlias) {
     if (NULL == fPathRendererChain) {
         fPathRendererChain = 
             new GrPathRendererChain(this, GrPathRendererChain::kNone_UsageFlag);
     }
-    return fPathRendererChain->getPathRenderer(fGpu->getCaps(), path,
-                                               fill, antiAlias);
+    return fPathRendererChain->getPathRenderer(path, fill, target, antiAlias);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -2022,12 +1961,6 @@
     fAAFillRectIndexBuffer = NULL;
     fAAStrokeRectIndexBuffer = NULL;
     
-    int gpuMaxOffscreen = gpu->getCaps().fMaxRenderTargetSize;
-    if (!PREFER_MSAA_OFFSCREEN_AA || !gpu->getCaps().fFSAASupport) {
-        gpuMaxOffscreen /= OFFSCREEN_SSAA_SCALE;
-    }
-    fMaxOffscreenAASize = GrMin(GR_MAX_OFFSCREEN_AA_SIZE, gpuMaxOffscreen);
-
     this->setupDrawBuffer();
 }
 
@@ -2072,48 +2005,51 @@
     return fGpu->getQuadIndexBuffer();
 }
 
-void GrContext::convolveInX(GrTexture* texture,
-                            const SkRect& rect,
-                            const float* kernel,
-                            int kernelWidth) {
-    ASSERT_OWNED_RESOURCE(texture);
-
-    float imageIncrement[2] = {1.0f / texture->width(), 0.0f};
-    convolve(texture, rect, imageIncrement, kernel, kernelWidth);
-}
-
-void GrContext::convolveInY(GrTexture* texture,
-                            const SkRect& rect,
-                            const float* kernel,
-                            int kernelWidth) {
-    ASSERT_OWNED_RESOURCE(texture);
-
-    float imageIncrement[2] = {0.0f, 1.0f / texture->height()};
-    convolve(texture, rect, imageIncrement, kernel, kernelWidth);
-}
-
 void GrContext::convolve(GrTexture* texture,
                          const SkRect& rect,
-                         float imageIncrement[2],
                          const float* kernel,
-                         int kernelWidth) {
+                         int kernelWidth,
+                         GrSamplerState::FilterDirection direction) {
     ASSERT_OWNED_RESOURCE(texture);
 
     GrDrawTarget::AutoStateRestore asr(fGpu);
     GrDrawState* drawState = fGpu->drawState();
+    GrRenderTarget* target = drawState->getRenderTarget();
+    drawState->reset();
+    drawState->setRenderTarget(target);
     GrMatrix sampleM;
     sampleM.setIDiv(texture->width(), texture->height());
     drawState->sampler(0)->reset(GrSamplerState::kClamp_WrapMode,
                                  GrSamplerState::kConvolution_Filter,
                                  sampleM);
-    drawState->sampler(0)->setConvolutionParams(kernelWidth,
-                                                kernel,
-                                                imageIncrement);
-
-    drawState->setViewMatrix(GrMatrix::I());
+    drawState->sampler(0)->setConvolutionParams(kernelWidth, kernel);
+    drawState->sampler(0)->setFilterDirection(direction);
     drawState->setTexture(0, texture);
-    drawState->setAlpha(0xFF);
-    drawState->setBlendFunc(kOne_BlendCoeff, kZero_BlendCoeff);
+    fGpu->drawSimpleRect(rect, NULL, 1 << 0);
+}
+
+void GrContext::applyMorphology(GrTexture* texture,
+                                const SkRect& rect,
+                                int radius,
+                                GrSamplerState::Filter filter,
+                                GrSamplerState::FilterDirection direction) {
+    ASSERT_OWNED_RESOURCE(texture);
+    GrAssert(filter == GrSamplerState::kErode_Filter ||
+             filter == GrSamplerState::kDilate_Filter);
+
+    GrDrawTarget::AutoStateRestore asr(fGpu);
+    GrDrawState* drawState = fGpu->drawState();
+    GrRenderTarget* target = drawState->getRenderTarget();
+    drawState->reset();
+    drawState->setRenderTarget(target);
+    GrMatrix sampleM;
+    sampleM.setIDiv(texture->width(), texture->height());
+    drawState->sampler(0)->reset(GrSamplerState::kClamp_WrapMode,
+                                 filter,
+                                 sampleM);
+    drawState->sampler(0)->setMorphologyRadius(radius);
+    drawState->sampler(0)->setFilterDirection(direction);
+    drawState->setTexture(0, texture);
     fGpu->drawSimpleRect(rect, NULL, 1 << 0);
 }
 
diff --git a/src/gpu/GrDefaultPathRenderer.cpp b/src/gpu/GrDefaultPathRenderer.cpp
index 6ea0459..72b3c60 100644
--- a/src/gpu/GrDefaultPathRenderer.cpp
+++ b/src/gpu/GrDefaultPathRenderer.cpp
@@ -18,21 +18,7 @@
 GrDefaultPathRenderer::GrDefaultPathRenderer(bool separateStencilSupport,
                                              bool stencilWrapOpsSupport)
     : fSeparateStencil(separateStencilSupport)
-    , fStencilWrapOps(stencilWrapOpsSupport)
-    , fSubpathCount(0)
-    , fSubpathVertCount(0)
-    , fPreviousSrcTol(-GR_Scalar1)
-    , fPreviousStages(-1) {
-    fTarget = NULL;
-}
-
-bool GrDefaultPathRenderer::canDrawPath(const GrDrawTarget::Caps& targetCaps,
-                                        const SkPath& path,
-                                        GrPathFill fill,
-                                        bool antiAlias) const {
-    // this class can draw any path with any fill but doesn't do any 
-    // anti-aliasing.
-    return !antiAlias; 
+    , fStencilWrapOps(stencilWrapOpsSupport) {
 }
 
 
@@ -162,49 +148,23 @@
 ////////////////////////////////////////////////////////////////////////////////
 // Helpers for drawPath
 
-static GrConvexHint getConvexHint(const SkPath& path) {
-    return path.isConvex() ? kConvex_ConvexHint : kConcave_ConvexHint;
-}
-
 #define STENCIL_OFF     0   // Always disable stencil (even when needed)
 
-static inline bool single_pass_path(const GrDrawTarget& target,
-                                    const GrPath& path,
-                                    GrPathFill fill) {
+static inline bool single_pass_path(const GrPath& path, GrPathFill fill) {
 #if STENCIL_OFF
     return true;
 #else
-    if (kEvenOdd_PathFill == fill) {
-        GrConvexHint hint = getConvexHint(path);
-        return hint == kConvex_ConvexHint ||
-               hint == kNonOverlappingConvexPieces_ConvexHint;
-    } else if (kWinding_PathFill == fill) {
-        GrConvexHint hint = getConvexHint(path);
-        return hint == kConvex_ConvexHint ||
-               hint == kNonOverlappingConvexPieces_ConvexHint ||
-               (hint == kSameWindingConvexPieces_ConvexHint &&
-                !target.drawWillReadDst() &&
-                !target.getDrawState().isDitherState());
-
+    if (kEvenOdd_PathFill == fill || kWinding_PathFill == fill) {
+        return path.isConvex();
     }
     return false;
 #endif
 }
 
-bool GrDefaultPathRenderer::requiresStencilPass(const GrDrawTarget* target,
-                                                const GrPath& path,
-                                                GrPathFill fill) const {
-    return !single_pass_path(*target, path, fill);
-}
-
-void GrDefaultPathRenderer::pathWillClear() {
-    fSubpathVertCount.reset(0);
-    fTarget->resetVertexSource();
-    if (fUseIndexedDraw) {
-        fTarget->resetIndexSource();
-    }
-    fPreviousSrcTol = -GR_Scalar1;
-    fPreviousStages = -1;
+bool GrDefaultPathRenderer::requiresStencilPass(const SkPath& path,
+                                                GrPathFill fill,
+                                                const GrDrawTarget* target) const {
+    return !single_pass_path(path, fill);
 }
 
 static inline void append_countour_edge_indices(GrPathFill fillType,
@@ -221,13 +181,21 @@
     *((*indices)++) = edgeV0Idx + 1;
 }
 
-bool GrDefaultPathRenderer::createGeom(GrScalar srcSpaceTol,
-                                       GrDrawState::StageMask stageMask) {
+bool GrDefaultPathRenderer::createGeom(const SkPath& path,
+                                       GrPathFill fill,
+                                       const GrVec* translate,
+                                       GrScalar srcSpaceTol,
+                                       GrDrawTarget* target,
+                                       GrDrawState::StageMask stageMask,
+                                       GrPrimitiveType* primType,
+                                       int* vertexCnt,
+                                       int* indexCnt) {
     {
     SK_TRACE_EVENT0("GrDefaultPathRenderer::createGeom");
 
     GrScalar srcSpaceTolSqd = GrMul(srcSpaceTol, srcSpaceTol);
-    int maxPts = GrPathUtils::worstCasePointCount(*fPath, &fSubpathCount,
+    int contourCnt;
+    int maxPts = GrPathUtils::worstCasePointCount(path, &contourCnt,
                                                   srcSpaceTol);
 
     if (maxPts <= 0) {
@@ -245,27 +213,27 @@
         }
     }
 
-    fUseIndexedDraw = fSubpathCount > 1;
+    bool indexed = contourCnt > 1;
 
     int maxIdxs = 0;
-    if (kHairLine_PathFill == fFill) {
-        if (fUseIndexedDraw) {
+    if (kHairLine_PathFill == fill) {
+        if (indexed) {
             maxIdxs = 2 * maxPts;
-            fPrimitiveType = kLines_PrimitiveType;
+            *primType = kLines_PrimitiveType;
         } else {
-            fPrimitiveType = kLineStrip_PrimitiveType;
+            *primType = kLineStrip_PrimitiveType;
         }
     } else {
-        if (fUseIndexedDraw) {
+        if (indexed) {
             maxIdxs = 3 * maxPts;
-            fPrimitiveType = kTriangles_PrimitiveType;
+            *primType = kTriangles_PrimitiveType;
         } else {
-            fPrimitiveType = kTriangleFan_PrimitiveType;
+            *primType = kTriangleFan_PrimitiveType;
         }
     }
 
     GrPoint* base;
-    if (!fTarget->reserveVertexSpace(layout, maxPts, (void**)&base)) {
+    if (!target->reserveVertexSpace(layout, maxPts, (void**)&base)) {
         return false;
     }
     GrAssert(NULL != base);
@@ -274,23 +242,21 @@
     uint16_t* idxBase = NULL;
     uint16_t* idx = NULL;
     uint16_t subpathIdxStart = 0;
-    if (fUseIndexedDraw) {
-        if (!fTarget->reserveIndexSpace(maxIdxs, (void**)&idxBase)) {
-            fTarget->resetVertexSource();
+    if (indexed) {
+        if (!target->reserveIndexSpace(maxIdxs, (void**)&idxBase)) {
+            target->resetVertexSource();
             return false;
         }
         GrAssert(NULL != idxBase);
         idx = idxBase;
     }
 
-    fSubpathVertCount.reset(fSubpathCount);
-
     GrPoint pts[4];
 
     bool first = true;
     int subpath = 0;
 
-    SkPath::Iter iter(*fPath, false);
+    SkPath::Iter iter(path, false);
 
     for (;;) {
         GrPathCmd cmd = (GrPathCmd)iter.next(pts);
@@ -298,7 +264,6 @@
             case kMove_PathCmd:
                 if (!first) {
                     uint16_t currIdx = (uint16_t) (vert - base);
-                    fSubpathVertCount[subpath] = currIdx - subpathIdxStart;
                     subpathIdxStart = currIdx;
                     ++subpath;
                 }
@@ -306,9 +271,9 @@
                 vert++;
                 break;
             case kLine_PathCmd:
-                if (fUseIndexedDraw) {
+                if (indexed) {
                     uint16_t prevIdx = (uint16_t)(vert - base) - 1;
-                    append_countour_edge_indices(fFill, subpathIdxStart,
+                    append_countour_edge_indices(fill, subpathIdxStart,
                                                  prevIdx, &idx);
                 }
                 *(vert++) = pts[1];
@@ -321,9 +286,9 @@
                             pts[0], pts[1], pts[2],
                             srcSpaceTolSqd, &vert,
                             GrPathUtils::quadraticPointCount(pts, srcSpaceTol));
-                if (fUseIndexedDraw) {
+                if (indexed) {
                     for (uint16_t i = 0; i < numPts; ++i) {
-                        append_countour_edge_indices(fFill, subpathIdxStart,
+                        append_countour_edge_indices(fill, subpathIdxStart,
                                                      firstQPtIdx + i, &idx);
                     }
                 }
@@ -336,9 +301,9 @@
                                 pts[0], pts[1], pts[2], pts[3],
                                 srcSpaceTolSqd, &vert,
                                 GrPathUtils::cubicPointCount(pts, srcSpaceTol));
-                if (fUseIndexedDraw) {
+                if (indexed) {
                     for (uint16_t i = 0; i < numPts; ++i) {
-                        append_countour_edge_indices(fFill, subpathIdxStart,
+                        append_countour_edge_indices(fill, subpathIdxStart,
                                                      firstCPtIdx + i, &idx);
                     }
                 }
@@ -348,7 +313,6 @@
                 break;
             case kEnd_PathCmd:
                 uint16_t currIdx = (uint16_t) (vert - base);
-                fSubpathVertCount[subpath] = currIdx - subpathIdxStart;
                 goto FINISHED;
         }
         first = false;
@@ -357,49 +321,49 @@
     GrAssert((vert - base) <= maxPts);
     GrAssert((idx - idxBase) <= maxIdxs);
 
-    fVertexCnt = vert - base;
-    fIndexCnt = idx - idxBase;
+    *vertexCnt = vert - base;
+    *indexCnt = idx - idxBase;
 
-    if (fTranslate.fX || fTranslate.fY) {
+    if (NULL != translate && 
+        (translate->fX || translate->fY)) {
         int count = vert - base;
         for (int i = 0; i < count; i++) {
-            base[i].offset(fTranslate.fX, fTranslate.fY);
+            base[i].offset(translate->fX, translate->fY);
         }
     }
     }
-    // set these at the end so if we failed on first drawPath inside a
-    // setPath/clearPath block we won't assume geom was created on a subsequent
-    // drawPath in the same block.
-    fPreviousSrcTol = srcSpaceTol;
-    fPreviousStages = stageMask;
     return true;
 }
 
-void GrDefaultPathRenderer::onDrawPath(GrDrawState::StageMask stageMask,
-                                       bool stencilOnly) {
+bool GrDefaultPathRenderer::internalDrawPath(const SkPath& path,
+                                             GrPathFill fill,
+                                             const GrVec* translate,
+                                             GrDrawTarget* target,
+                                             GrDrawState::StageMask stageMask,
+                                             bool stencilOnly) {
 
-    GrMatrix viewM = fTarget->getDrawState().getViewMatrix();
+    GrMatrix viewM = target->getDrawState().getViewMatrix();
     GrScalar tol = GR_Scalar1;
-    tol = GrPathUtils::scaleToleranceToSrc(tol, viewM, fPath->getBounds());
-    GrDrawState* drawState = fTarget->drawState();
+    tol = GrPathUtils::scaleToleranceToSrc(tol, viewM, path.getBounds());
+    GrDrawState* drawState = target->drawState();
 
-    // FIXME: It's really dumb that we recreate the verts for a new vertex
-    // layout. We only do that because the GrDrawTarget API doesn't allow
-    // us to change the vertex layout after reserveVertexSpace(). We won't
-    // actually change the vertex data when the layout changes since all the
-    // stages reference the positions (rather than having separate tex coords)
-    // and we don't ever have per-vert colors. In practice our call sites
-    // won't change the stages in use inside a setPath / removePath pair. But
-    // it is a silly limitation of the GrDrawTarget design that should be fixed.
-    if (tol != fPreviousSrcTol ||
-        stageMask != fPreviousStages) {
-        if (!this->createGeom(tol, stageMask)) {
-            return;
-        }
+    int vertexCnt;
+    int indexCnt;
+    GrPrimitiveType primType;
+    if (!this->createGeom(path,
+                          fill,
+                          translate,
+                          tol,
+                          target,
+                          stageMask,
+                          &primType,
+                          &vertexCnt,
+                          &indexCnt)) {
+        return false;
     }
 
-    GrAssert(NULL != fTarget);
-    GrDrawTarget::AutoStateRestore asr(fTarget);
+    GrAssert(NULL != target);
+    GrDrawTarget::AutoStateRestore asr(target);
     bool colorWritesWereDisabled = drawState->isColorWriteDisabled();
     // face culling doesn't make sense here
     GrAssert(GrDrawState::kBoth_DrawFace == drawState->getDrawFace());
@@ -410,7 +374,7 @@
     bool                        reverse = false;
     bool                        lastPassIsBounds;
 
-    if (kHairLine_PathFill == fFill) {
+    if (kHairLine_PathFill == fill) {
         passCount = 1;
         if (stencilOnly) {
             passes[0] = &gDirectToStencil;
@@ -420,7 +384,7 @@
         lastPassIsBounds = false;
         drawFace[0] = GrDrawState::kBoth_DrawFace;
     } else {
-        if (single_pass_path(*fTarget, *fPath, fFill)) {
+        if (single_pass_path(path, fill)) {
             passCount = 1;
             if (stencilOnly) {
                 passes[0] = &gDirectToStencil;
@@ -430,7 +394,7 @@
             drawFace[0] = GrDrawState::kBoth_DrawFace;
             lastPassIsBounds = false;
         } else {
-            switch (fFill) {
+            switch (fill) {
                 case kInverseEvenOdd_PathFill:
                     reverse = true;
                     // fallthrough
@@ -491,7 +455,7 @@
                     break;
                 default:
                     GrAssert(!"Unknown path fFill!");
-                    return;
+                    return false;
             }
         }
     }
@@ -523,44 +487,63 @@
                     if (stageMask) {
                         if (!drawState->getViewInverse(&vmi)) {
                             GrPrintf("Could not invert matrix.");
-                            return;
+                            return false;
                         }
                         drawState->preConcatSamplerMatrices(stageMask, vmi);
                     }
                     drawState->setViewMatrix(GrMatrix::I());
                 }
             } else {
-                bounds = fPath->getBounds();
-                bounds.offset(fTranslate);
+                bounds = path.getBounds();
+                if (NULL != translate) {
+                    bounds.offset(*translate);
+                }
             }
-            GrDrawTarget::AutoGeometryPush agp(fTarget);
-            fTarget->drawSimpleRect(bounds, NULL, stageMask);
+            GrDrawTarget::AutoGeometryPush agp(target);
+            target->drawSimpleRect(bounds, NULL, stageMask);
         } else {
             if (passCount > 1) {
                 drawState->enableState(GrDrawState::kNoColorWrites_StateBit);
             }
-            if (fUseIndexedDraw) {
-                fTarget->drawIndexed(fPrimitiveType, 0, 0, 
-                                     fVertexCnt, fIndexCnt);
+            if (indexCnt) {
+                target->drawIndexed(primType, 0, 0, 
+                                    vertexCnt, indexCnt);
             } else {
-                int baseVertex = 0;
-                for (int sp = 0; sp < fSubpathCount; ++sp) {
-                    fTarget->drawNonIndexed(fPrimitiveType, baseVertex,
-                                            fSubpathVertCount[sp]);
-                    baseVertex += fSubpathVertCount[sp];
-                }
+                target->drawNonIndexed(primType, 0, vertexCnt);
             }
         }
     }
     }
+    return true;
 }
 
-void GrDefaultPathRenderer::drawPath(GrDrawState::StageMask stageMask) {
-    this->onDrawPath(stageMask, false);
+bool GrDefaultPathRenderer::canDrawPath(const SkPath& path,
+                                        GrPathFill fill,
+                                        const GrDrawTarget* target,
+                                        bool antiAlias) const {
+    // this class can draw any path with any fill but doesn't do any 
+    // anti-aliasing.
+    return !antiAlias;
 }
 
-void GrDefaultPathRenderer::drawPathToStencil() {
-    GrAssert(kInverseEvenOdd_PathFill != fFill);
-    GrAssert(kInverseWinding_PathFill != fFill);
-    this->onDrawPath(0, true);
+bool GrDefaultPathRenderer::onDrawPath(const SkPath& path,
+                                       GrPathFill fill,
+                                       const GrVec* translate,
+                                       GrDrawTarget* target,
+                                       GrDrawState::StageMask stageMask,
+                                       bool antiAlias) {
+    return this->internalDrawPath(path,
+                                  fill,
+                                  translate,
+                                  target,
+                                  stageMask,
+                                  false);
+}
+
+void GrDefaultPathRenderer::drawPathToStencil(const SkPath& path,
+                                              GrPathFill fill,
+                                              GrDrawTarget* target) {
+    GrAssert(kInverseEvenOdd_PathFill != fill);
+    GrAssert(kInverseWinding_PathFill != fill);
+    this->internalDrawPath(path, fill, NULL, target, 0, true);
 }
diff --git a/src/gpu/GrDefaultPathRenderer.h b/src/gpu/GrDefaultPathRenderer.h
index adfe7d2..a6a4cda 100644
--- a/src/gpu/GrDefaultPathRenderer.h
+++ b/src/gpu/GrDefaultPathRenderer.h
@@ -20,40 +20,49 @@
     GrDefaultPathRenderer(bool separateStencilSupport,
                           bool stencilWrapOpsSupport);
 
-    virtual bool canDrawPath(const GrDrawTarget::Caps& targetCaps,
-                             const SkPath& path,
-                             GrPathFill fill,
-                             bool antiAlias) const SK_OVERRIDE;
 
-    virtual bool requiresStencilPass(const GrDrawTarget* target,
-                                     const SkPath& path,
-                                     GrPathFill fill) const SK_OVERRIDE;
+    virtual bool requiresStencilPass(const SkPath& path,
+                                     GrPathFill fill,
+                                     const GrDrawTarget* target) const SK_OVERRIDE;
 
-    virtual void drawPath(GrDrawState::StageMask stageMask) SK_OVERRIDE;
-    virtual void drawPathToStencil() SK_OVERRIDE;
+    virtual bool canDrawPath(const SkPath& path,
+                            GrPathFill fill,
+                            const GrDrawTarget* target,
+                            bool antiAlias) const SK_OVERRIDE;
 
-protected:
-    virtual void pathWillClear();
+    virtual void drawPathToStencil(const SkPath& path,
+                                   GrPathFill fill,
+                                   GrDrawTarget* target) SK_OVERRIDE;
 
 private:
 
-    void onDrawPath(GrDrawState::StageMask stages, bool stencilOnly);
+    virtual bool onDrawPath(const SkPath& path,
+                            GrPathFill fill,
+                            const GrVec* translate,
+                            GrDrawTarget* target,
+                            GrDrawState::StageMask stageMask,
+                            bool antiAlias) SK_OVERRIDE;
 
-    bool createGeom(GrScalar srcSpaceTol,
-                   GrDrawState::StageMask stages);
+    bool internalDrawPath(const SkPath& path,
+                          GrPathFill fill,
+                          const GrVec* translate,
+                          GrDrawTarget* target,
+                          GrDrawState::StageMask stageMask,
+                          bool stencilOnly);
+
+    bool createGeom(const SkPath& path,
+                    GrPathFill fill,
+                    const GrVec* translate,
+                    GrScalar srcSpaceTol,
+                    GrDrawTarget* target,
+                    GrDrawState::StageMask stages,
+                    GrPrimitiveType* primType,
+                    int* vertexCnt,
+                    int* indexCnt);
 
     bool    fSeparateStencil;
     bool    fStencilWrapOps;
 
-    int                         fSubpathCount;
-    SkAutoSTMalloc<8, uint16_t> fSubpathVertCount;
-    int                         fIndexCnt;
-    int                         fVertexCnt;
-    GrScalar                    fPreviousSrcTol;
-    GrDrawState::StageMask      fPreviousStages;
-    GrPrimitiveType             fPrimitiveType;
-    bool                        fUseIndexedDraw;
-
     typedef GrPathRenderer INHERITED;
 };
 
diff --git a/src/gpu/GrDrawState.h b/src/gpu/GrDrawState.h
index 4191c4a..18dd6bf 100644
--- a/src/gpu/GrDrawState.h
+++ b/src/gpu/GrDrawState.h
@@ -47,21 +47,49 @@
     GR_STATIC_ASSERT(sizeof(StageMask)*8 >= GrDrawState::kNumStages);
 
     GrDrawState() {
-        // make sure any pad is zero for memcmp
-        // all GrDrawState members should default to something
-        // valid by the memset
-        memset(this, 0, sizeof(GrDrawState));
-            
-        // memset exceptions
-        fColorFilterMode = SkXfermode::kDstIn_Mode;
-        fFirstCoverageStage = kNumStages;
+        this->reset();
+    }
 
+    GrDrawState(const GrDrawState& state) {
+        *this = state;
+    }
+
+    /**
+     * Resets to the default state. Sampler states will not be modified.
+     */ 
+    void reset() {
+        // make sure any pad is zero for memcmp
+        // all GrDrawState members should default to something valid by the
+        // memset except those initialized individually below. There should
+        // be no padding between the individually initialized members.
+        static const size_t kMemsetSize =
+            reinterpret_cast<intptr_t>(&fColor) -
+            reinterpret_cast<intptr_t>(this);
+        memset(this, 0, kMemsetSize);
         // pedantic assertion that our ptrs will
         // be NULL (0 ptr is mem addr 0)
         GrAssert((intptr_t)(void*)NULL == 0LL);
-
+        GR_STATIC_ASSERT(0 == kBoth_DrawFace);
         GrAssert(fStencilSettings.isDisabled());
+
+        // memset exceptions
+        fColor = 0xffffffff;
+        fCoverage = 0xffffffff;
         fFirstCoverageStage = kNumStages;
+        fColorFilterMode = SkXfermode::kDst_Mode;
+        fSrcBlend = kOne_BlendCoeff;
+        fDstBlend = kZero_BlendCoeff;
+        fViewMatrix.reset();
+
+        // ensure values that will be memcmp'ed in == but not memset in reset()
+        // are tightly packed
+        GrAssert(kMemsetSize +  sizeof(fColor) + sizeof(fCoverage) +
+                 sizeof(fFirstCoverageStage) + sizeof(fColorFilterMode) +
+                 sizeof(fSrcBlend) + sizeof(fDstBlend) + sizeof(GrMatrix) ==
+                 reinterpret_cast<intptr_t>(&fEdgeAANumEdges) -
+                 reinterpret_cast<intptr_t>(this));
+
+        fEdgeAANumEdges = 0;
     }
 
     ///////////////////////////////////////////////////////////////////////////
@@ -102,6 +130,33 @@
     /// @}
 
     ///////////////////////////////////////////////////////////////////////////
+    /// @name Coverage
+    ////
+
+    /**
+     * Sets a constant fractional coverage to be applied to the draw. The 
+     * initial value (after construction or reset()) is 0xff. The constant
+     * coverage is ignored when per-vertex coverage is provided.
+     */
+    void setCoverage(uint8_t coverage) {
+        fCoverage = GrColorPackRGBA(coverage, coverage, coverage, coverage);
+    }
+
+    /**
+     * Version of above that specifies a 4-channel constant coverage. The value
+     * should be premultiplied.
+     */
+    void setCoverage4(GrColor coverage) {
+        fCoverage = coverage;
+    }
+
+    GrColor getCoverage() const {
+        return fCoverage;
+    }
+
+    /// @}
+
+    ///////////////////////////////////////////////////////////////////////////
     /// @name Textures
     ////
 
@@ -489,9 +544,15 @@
         /* 1-pixel wide line
            2D implicit line eq (a*x + b*y +c = 0). 4th component unused */
         kHairLine_EdgeType,
-        /* 1-pixel wide quadratic
-           u^2-v canonical coords (only 2 components used) */
-        kHairQuad_EdgeType
+        /* Quadratic specified by u^2-v canonical coords (only 2 
+           components used). Coverage based on signed distance with negative
+           being inside, positive outside.*/
+        kQuad_EdgeType,
+        /* Same as above but for hairline quadratics. Uses unsigned distance.
+           Coverage is max(0, 1-distance). */
+        kHairQuad_EdgeType,
+
+        kVertexEdgeTypeCnt
     };
 
     /**
@@ -500,12 +561,11 @@
      * are not specified the value of this setting has no effect.
      */
     void setVertexEdgeType(VertexEdgeType type) {
+        GrAssert(type >=0 && type < kVertexEdgeTypeCnt);
         fVertexEdgeType = type;
     }
 
-    VertexEdgeType getVertexEdgeType() const {
-        return fVertexEdgeType;
-    }
+    VertexEdgeType getVertexEdgeType() const { return fVertexEdgeType; }
 
     /**
      * The absolute maximum number of edges that may be specified for
@@ -671,9 +731,7 @@
      * or both faces.
      * @return the current draw face(s).
      */
-    DrawFace getDrawFace() const {
-        return fDrawFace;
-    }
+    DrawFace getDrawFace() const { return fDrawFace; }
     
     /// @}
 
@@ -713,32 +771,38 @@
 
 private:
     static const StageMask kIllegalStageMaskBits = ~((1 << kNumStages)-1);
-    uint8_t                 fFlagBits;
-    GrBlendCoeff            fSrcBlend : 8;
-    GrBlendCoeff            fDstBlend : 8;
-    DrawFace                fDrawFace : 8;
-    uint8_t                 fFirstCoverageStage;
-    SkXfermode::Mode        fColorFilterMode : 8;
-    GrColor                 fBlendConstant;
-    GrTexture*              fTextures[kNumStages];
-    GrRenderTarget*         fRenderTarget;
-    GrColor                 fColor;
-    GrColor                 fColorFilterColor;
-    float                   fColorMatrix[20];
-    GrStencilSettings       fStencilSettings;
-    GrMatrix                fViewMatrix;
+    // @{ these fields can be initialized with memset to 0
+    GrColor             fBlendConstant;
+    GrTexture*          fTextures[kNumStages];
+    GrColor             fColorFilterColor;
+    uint32_t            fFlagBits;
+    DrawFace            fDrawFace; 
+    VertexEdgeType      fVertexEdgeType;
+    GrStencilSettings   fStencilSettings;
+    float               fColorMatrix[20];       // 5 x 4 matrix
+    GrRenderTarget*     fRenderTarget;
+    // @}
+
+    // @{ Initialized to values other than zero
+    GrColor             fColor;
+    GrColor             fCoverage;
+    int                 fFirstCoverageStage;
+    SkXfermode::Mode    fColorFilterMode;
+    GrBlendCoeff        fSrcBlend;
+    GrBlendCoeff        fDstBlend;
+    GrMatrix            fViewMatrix;
+    // @}
+
     // @{ Data for GrTesselatedPathRenderer
     // TODO: currently ignored in copying & comparison for performance.
     // Must be considered if GrTesselatedPathRenderer is being used.
-
-    VertexEdgeType          fVertexEdgeType;
-    int                     fEdgeAANumEdges;
-    Edge                    fEdgeAAEdges[kMaxEdges];
-
+    int                 fEdgeAANumEdges;
+    Edge                fEdgeAAEdges[kMaxEdges];
     // @}
+
     // This field must be last; it will not be copied or compared
     // if the corresponding fTexture[] is NULL.
-    GrSamplerState          fSamplerStates[kNumStages];
+    GrSamplerState      fSamplerStates[kNumStages];
 
     size_t leadingBytes() const {
         // Can't use offsetof() with non-POD types, so stuck with pointer math.
@@ -746,7 +810,7 @@
         // have a compile-time flag that lets us know if it's being used, and
         // checking at runtime seems to cost 5% performance.
         return (size_t) ((unsigned char*)&fEdgeAANumEdges -
-                         (unsigned char*)&fFlagBits);
+                         (unsigned char*)&fBlendConstant);
     }
 
 };
diff --git a/src/gpu/GrDrawTarget.cpp b/src/gpu/GrDrawTarget.cpp
index be6bd0a..d68aa39 100644
--- a/src/gpu/GrDrawTarget.cpp
+++ b/src/gpu/GrDrawTarget.cpp
@@ -512,11 +512,11 @@
 }
 
 void GrDrawTarget::saveCurrentDrawState(SavedDrawState* state) const {
-    state->fState = fCurrDrawState;
+    state->fState.set(fCurrDrawState);
 }
 
 void GrDrawTarget::restoreDrawState(const SavedDrawState& state) {
-    fCurrDrawState = state.fState;
+    fCurrDrawState = *state.fState.get();
 }
 
 void GrDrawTarget::copyDrawState(const GrDrawTarget& srcTarget) {
@@ -885,8 +885,10 @@
 
     // When coeffs are (0,1) there is no reason to draw at all, unless
     // stenciling is enabled. Having color writes disabled is effectively
-    // (0,1).
-    if ((kZero_BlendCoeff == *srcCoeff && dstCoeffIsOne)) {
+    // (0,1). The same applies when coverage is known to be 0.
+    if ((kZero_BlendCoeff == *srcCoeff && dstCoeffIsOne) ||
+        (!(layout & kCoverage_VertexLayoutBit) && 
+         0 == drawState.getCoverage())) {
         if (drawState.getStencil().doesWrite()) {
             return kDisableBlend_BlendOptFlag |
                    kEmitTransBlack_BlendOptFlag;
@@ -895,8 +897,10 @@
         }
     }
 
-    // check for coverage due to edge aa or coverage texture stage
+    // check for coverage due to constant coverage, per-vertex coverage,
+    // edge aa or coverage texture stage
     bool hasCoverage = forceCoverage ||
+                       0xffffffff != drawState.getCoverage() || 
                        drawState.getNumAAEdges() > 0 ||
                        (layout & kCoverage_VertexLayoutBit) ||
                        (layout & kEdge_VertexLayoutBit);
diff --git a/src/gpu/GrDrawTarget.h b/src/gpu/GrDrawTarget.h
index d52565d..6962a18 100644
--- a/src/gpu/GrDrawTarget.h
+++ b/src/gpu/GrDrawTarget.h
@@ -21,6 +21,7 @@
 #include "GrTexture.h"
 
 #include "SkXfermode.h"
+#include "SkTLazy.h"
 
 class GrTexture;
 class GrClipIterator;
@@ -50,7 +51,6 @@
         bool fFSAASupport               : 1;
         bool fDualSourceBlendingSupport : 1;
         bool fBufferLockSupport         : 1;
-        bool fSupportPerVertexCoverage  : 1;
         int fMaxRenderTargetSize;
         int fMaxTextureSize;
     };
@@ -140,7 +140,7 @@
      */
     struct SavedDrawState {
     private:
-        GrDrawState fState;
+        SkTLazy<GrDrawState> fState;
         friend class GrDrawTarget;
     };
 
@@ -240,8 +240,7 @@
     enum VertexLayoutBits {
         /* vertices have colors (GrColor) */
         kColor_VertexLayoutBit              = 1 << (STAGE_BIT_CNT + 0),
-        /* vertices have coverage (GrColor where all channels should have the 
-         * same value)
+        /* vertices have coverage (GrColor)
          */
         kCoverage_VertexLayoutBit           = 1 << (STAGE_BIT_CNT + 1),
         /* Use text vertices. (Pos and tex coords may be a different type for
@@ -928,10 +927,12 @@
 
     // Helpers for GrDrawTarget subclasses that won't have private access to
     // SavedDrawState but need to peek at the state values.
-    static GrDrawState& accessSavedDrawState(SavedDrawState& sds)
-                                                        { return sds.fState; }
-    static const GrDrawState& accessSavedDrawState(const SavedDrawState& sds)
-                                                        { return sds.fState; }
+    static GrDrawState& accessSavedDrawState(SavedDrawState& sds) {
+        return *sds.fState.get();
+    }
+    static const GrDrawState& accessSavedDrawState(const SavedDrawState& sds){
+        return *sds.fState.get();
+    }
 
     // implemented by subclass to allocate space for reserved geom
     virtual bool onReserveVertexSpace(GrVertexLayout vertexLayout,
diff --git a/src/gpu/GrGLSL.cpp b/src/gpu/GrGLSL.cpp
deleted file mode 100644
index 1062c81..0000000
--- a/src/gpu/GrGLSL.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright 2011 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#include "GrGLSL.h"
-
-GrGLSLGeneration GetGLSLGeneration(GrGLBinding binding,
-                                   const GrGLInterface* gl) {
-    GrGLSLVersion ver = GrGLGetGLSLVersion(gl);
-    switch (binding) {
-        case kDesktop_GrGLBinding:
-            GrAssert(ver >= GR_GLSL_VER(1,10));
-            if (ver >= GR_GLSL_VER(1,50)) {
-                return k150_GLSLGeneration;
-            } else if (ver >= GR_GLSL_VER(1,30)) {
-                return k130_GLSLGeneration;
-            } else {
-                return k110_GLSLGeneration;
-            }
-        case kES2_GrGLBinding:
-            // version 1.00 of ES GLSL based on ver 1.20 of desktop GLSL
-            GrAssert(ver >= GR_GL_VER(1,00));
-            return k110_GLSLGeneration;
-        default:
-            GrCrash("Unknown GL Binding");
-            return k110_GLSLGeneration; // suppress warning
-    }
-}
-
diff --git a/src/gpu/GrGLSL.h b/src/gpu/GrGLSL.h
deleted file mode 100644
index 501d1ba..0000000
--- a/src/gpu/GrGLSL.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright 2011 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#ifndef GrGLSL_DEFINED
-#define GrGLSL_DEFINED
-
-#include "GrGLInterface.h"
-
-// Limited set of GLSL versions we build shaders for. Caller should round
-// down the GLSL version to one of these enums.
-enum GrGLSLGeneration {
-    /**
-     * Desktop GLSL 1.10 and ES2 shading lang (based on desktop GLSL 1.20)
-     */
-    k110_GLSLGeneration,
-    /**
-     * Desktop GLSL 1.30
-     */
-    k130_GLSLGeneration,
-    /**
-     * Dekstop GLSL 1.50
-     */
-    k150_GLSLGeneration,
-};
-
-GrGLSLGeneration GetGLSLGeneration(GrGLBinding binding,
-                                   const GrGLInterface* gl);
-
-#endif
-
diff --git a/src/gpu/GrGpu.cpp b/src/gpu/GrGpu.cpp
index 8a7d3e4..32338ca 100644
--- a/src/gpu/GrGpu.cpp
+++ b/src/gpu/GrGpu.cpp
@@ -14,7 +14,7 @@
 #include "GrContext.h"
 #include "GrIndexBuffer.h"
 #include "GrPathRenderer.h"
-#include "GrGLStencilBuffer.h"
+#include "GrStencilBuffer.h"
 #include "GrVertexBuffer.h"
 
 // probably makes no sense for this to be less than a page
@@ -215,11 +215,6 @@
     return this->onCreatePlatformRenderTarget(desc);
 }
 
-GrResource* GrGpu::createPlatformSurface(const GrPlatformSurfaceDesc& desc) {
-    this->handleDirtyContext();
-    return this->onCreatePlatformSurface(desc);
-}
-
 GrVertexBuffer* GrGpu::createVertexBuffer(uint32_t size, bool dynamic) {
     this->handleDirtyContext();
     return this->onCreateVertexBuffer(size, dynamic);
@@ -265,6 +260,13 @@
                                config, buffer, rowBytes);
 }
 
+void GrGpu::resolveRenderTarget(GrRenderTarget* target) {
+    GrAssert(target);
+    this->handleDirtyContext();
+    this->onResolveRenderTarget(target);
+}
+
+
 ////////////////////////////////////////////////////////////////////////////////
 
 static const int MAX_QUADS = 1 << 12; // max possible: (1 << 14) - 1;
@@ -341,15 +343,17 @@
 
 ////////////////////////////////////////////////////////////////////////////////
 
-// stencil settings to use when clip is in stencil
-GR_STATIC_CONST_SAME_STENCIL(gClipStencilSettings,
-    kKeep_StencilOp,
-    kKeep_StencilOp,
-    kAlwaysIfInClip_StencilFunc,
-    0x0000,
-    0x0000,
-    0x0000);
-const GrStencilSettings& GrGpu::gClipStencilSettings = ::gClipStencilSettings;
+const GrStencilSettings* GrGpu::GetClipStencilSettings(void) {
+    // stencil settings to use when clip is in stencil
+    GR_STATIC_CONST_SAME_STENCIL_STRUCT(sClipStencilSettings,
+        kKeep_StencilOp,
+        kKeep_StencilOp,
+        kAlwaysIfInClip_StencilFunc,
+        0x0000,
+        0x0000,
+        0x0000);
+    return GR_CONST_STENCIL_SETTINGS_PTR_FROM_STRUCT_PTR(&sClipStencilSettings);
+}
 
 // mapping of clip-respecting stencil funcs to normal stencil funcs
 // mapping depends on whether stencil-clipping is in effect.
@@ -439,7 +443,7 @@
 #if VISUALIZE_COMPLEX_CLIP
     #include "GrRandom.h"
     GrRandom gRandom;
-    #define SET_RANDOM_COLOR this->setColor(0xff000000 | gRandom.nextU());
+    #define SET_RANDOM_COLOR drawState->setColor(0xff000000 | gRandom.nextU());
 #else
     #define SET_RANDOM_COLOR
 #endif
@@ -628,7 +632,6 @@
 
                 GrPathRenderer* pr = NULL;
                 const GrPath* clipPath = NULL;
-                GrPathRenderer::AutoClearPath arp;
                 if (kRect_ClipType == clip.getElementType(c)) {
                     canRenderDirectToStencil = true;
                     fill = kEvenOdd_PathFill;
@@ -651,8 +654,7 @@
                         return false;
                     }
                     canRenderDirectToStencil =
-                        !pr->requiresStencilPass(this, *clipPath, fill);
-                    arp.set(pr, this, clipPath, fill, false, NULL);
+                        !pr->requiresStencilPass(*clipPath, fill, this);
                 }
 
                 GrSetOp op = (c == start) ? startOp : clip.getOp(c);
@@ -686,9 +688,9 @@
                     } else {
                         if (canRenderDirectToStencil) {
                             *drawState->stencil() = gDrawToStencil;
-                            pr->drawPath(0);
+                            pr->drawPath(*clipPath, fill, NULL, this, 0, false);
                         } else {
-                            pr->drawPathToStencil();
+                            pr->drawPathToStencil(*clipPath, fill, this);
                         }
                     }
                 }
@@ -704,7 +706,7 @@
                             this->drawSimpleRect(clip.getRect(c), NULL, 0);
                         } else {
                             SET_RANDOM_COLOR
-                            pr->drawPath(0);
+                            pr->drawPath(*clipPath, fill, NULL, this, 0, false);
                         }
                     } else {
                         SET_RANDOM_COLOR
@@ -735,8 +737,7 @@
             new GrPathRendererChain(this->getContext(),
                                     GrPathRendererChain::kNonAAOnly_UsageFlag);
     }
-    return fPathRendererChain->getPathRenderer(this->getCaps(),
-                                               path, fill, false);
+    return fPathRendererChain->getPathRenderer(path, fill, this, false);
 }
 
 
diff --git a/src/gpu/GrGpu.h b/src/gpu/GrGpu.h
index 0836ec8..52282ed 100644
--- a/src/gpu/GrGpu.h
+++ b/src/gpu/GrGpu.h
@@ -132,11 +132,6 @@
     GrRenderTarget* createPlatformRenderTarget(const GrPlatformRenderTargetDesc& desc);
 
     /**
-     * DEPRECATED. This will be removed.
-     */
-    GrResource* createPlatformSurface(const GrPlatformSurfaceDesc& desc);
-
-    /**
      * Creates a vertex buffer.
      *
      * @param size    size in bytes of the vertex buffer
@@ -177,6 +172,11 @@
     const GrVertexBuffer* getUnitSquareVertexBuffer() const;
 
     /**
+     * Resolves MSAA.
+     */
+    void resolveRenderTarget(GrRenderTarget* target);
+
+    /**
      * Ensures that the current render target is actually set in the
      * underlying 3D API. Used when client wants to use 3D API to directly
      * render to the RT.
@@ -184,6 +184,13 @@
     void forceRenderTargetFlush();
 
     /**
+     * If this returns true then a sequence that reads unpremultiplied pixels
+     * from a surface, writes back the same values, and reads them again will
+     * give the same pixel values back in both reads.
+     */
+    virtual bool canPreserveReadWriteUnpremulPixels() = 0;
+
+    /**
      * readPixels with some configs may be slow. Given a desired config this
      * function returns a fast-path config. The returned config must have the
      * same components, component sizes, and not require conversion between
@@ -355,7 +362,7 @@
 
     // stencil settings to clip drawing when stencil clipping is in effect
     // and the client isn't using the stencil test.
-    static const GrStencilSettings& gClipStencilSettings;
+    static const GrStencilSettings* GetClipStencilSettings();
 
     GrGpuStats fStats;
 
@@ -401,7 +408,6 @@
                                        size_t rowBytes) = 0;
     virtual GrTexture* onCreatePlatformTexture(const GrPlatformTextureDesc& desc) = 0;
     virtual GrRenderTarget* onCreatePlatformRenderTarget(const GrPlatformRenderTargetDesc& desc) = 0;
-    virtual GrResource* onCreatePlatformSurface(const GrPlatformSurfaceDesc& desc) = 0;
     virtual GrVertexBuffer* onCreateVertexBuffer(uint32_t size,
                                                  bool dynamic) = 0;
     virtual GrIndexBuffer* onCreateIndexBuffer(uint32_t size,
@@ -439,6 +445,9 @@
                                       GrPixelConfig config, const void* buffer,
                                       size_t rowBytes) = 0;
 
+    // overridden by API-specific derived class to perform the resolve
+    virtual void onResolveRenderTarget(GrRenderTarget* target) = 0;
+
     // called to program the vertex data, indexCount will be 0 if drawing non-
     // indexed geometry. The subclass may adjust the startVertex and/or
     // startIndex since it may have already accounted for these in the setup.
diff --git a/src/gpu/GrGpuFactory.cpp b/src/gpu/GrGpuFactory.cpp
index c954c7a..585f35f 100644
--- a/src/gpu/GrGpuFactory.cpp
+++ b/src/gpu/GrGpuFactory.cpp
@@ -14,10 +14,10 @@
 //    #include "GrGpuD3D9.h"
 #endif
 
-#include "GrGLConfig.h"
+#include "gl/GrGLConfig.h"
 
 #include "GrGpu.h"
-#include "GrGpuGLShaders.h"
+#include "gl/GrGpuGLShaders.h"
 
 GrGpu* GrGpu::Create(GrEngine engine, GrPlatform3DContext context3D) {
 
@@ -39,15 +39,10 @@
 #endif
             return NULL;
         }
-        if (!glInterface->validate()) {
-#if GR_DEBUG
-            GrPrintf("Failed GL interface validation!\n");
-#endif
-            return NULL;
+        GrGLContextInfo ctxInfo(glInterface);
+        if (ctxInfo.isInitialized()) {
+            return new GrGpuGLShaders(ctxInfo);
         }
-
-        return new GrGpuGLShaders(glInterface);
-    } else {
-        return NULL;
     }
+    return NULL;
 }
diff --git a/src/gpu/GrGpuVertex.h b/src/gpu/GrGpuVertex.h
index 2abc2f4..d093e2d 100644
--- a/src/gpu/GrGpuVertex.h
+++ b/src/gpu/GrGpuVertex.h
@@ -11,7 +11,7 @@
 #ifndef GrGpuVertex_DEFINED
 #define GrGpuVertex_DEFINED
 
-#include "GrGLConfig.h"
+#include "gl/GrGLConfig.h"
 #include "GrPoint.h"
 
 #if GR_TEXT_SCALAR_IS_USHORT
diff --git a/src/gpu/GrInOrderDrawBuffer.cpp b/src/gpu/GrInOrderDrawBuffer.cpp
index cbe153c..1127927 100644
--- a/src/gpu/GrInOrderDrawBuffer.cpp
+++ b/src/gpu/GrInOrderDrawBuffer.cpp
@@ -246,7 +246,7 @@
         poolState.fUsedPoolIndexBytes = 
                             GrMax(poolState.fUsedPoolIndexBytes, indexBytes);
         draw.fIndexBuffer = poolState.fPoolIndexBuffer;
-        draw.fStartIndex += poolState.fPoolStartVertex;
+        draw.fStartIndex += poolState.fPoolStartIndex;
         break;
     }
     default:
diff --git a/src/gpu/GrInOrderDrawBuffer.h b/src/gpu/GrInOrderDrawBuffer.h
index f935be1..73746e9 100644
--- a/src/gpu/GrInOrderDrawBuffer.h
+++ b/src/gpu/GrInOrderDrawBuffer.h
@@ -151,8 +151,6 @@
         kGeoPoolStatePreAllocCnt = 4,
     };
 
-    const GrGpu*                    fGpu;
-
     GrSTAllocator<kDrawPreallocCnt, Draw>               fDraws;
     GrSTAllocator<kStatePreallocCnt, SavedDrawState>    fStates;
     GrSTAllocator<kClearPreallocCnt, Clear>             fClears;
diff --git a/src/gpu/GrPathRenderer.cpp b/src/gpu/GrPathRenderer.cpp
index a4f7d0c..31e06a6 100644
--- a/src/gpu/GrPathRenderer.cpp
+++ b/src/gpu/GrPathRenderer.cpp
@@ -8,36 +8,6 @@
 
 #include "GrPathRenderer.h"
 
-GrPathRenderer::GrPathRenderer()
-    : fPath(NULL)
-    , fTarget(NULL) {
+GrPathRenderer::GrPathRenderer() {
 }
 
-void GrPathRenderer::setPath(GrDrawTarget* target,
-                             const SkPath* path,
-                             GrPathFill fill,
-                             bool antiAlias,
-                             const GrPoint* translate) {
-    GrAssert(NULL == fPath);
-    GrAssert(NULL == fTarget);
-    GrAssert(NULL != target);
-
-    fTarget = target;
-    fPath = path;
-    fFill = fill;
-    fAntiAlias = antiAlias;
-    if (NULL != translate) {
-        fTranslate = *translate;
-    } else {
-        fTranslate.fX = fTranslate.fY = 0;
-    }
-    this->pathWasSet();
-}
-
-void GrPathRenderer::clearPath() {
-    this->pathWillClear();
-    fTarget->resetVertexSource();
-    fTarget->resetIndexSource();
-    fTarget = NULL;
-    fPath = NULL;
-}
diff --git a/src/gpu/GrPathRenderer.h b/src/gpu/GrPathRenderer.h
index e24b982..6ffcade 100644
--- a/src/gpu/GrPathRenderer.h
+++ b/src/gpu/GrPathRenderer.h
@@ -49,25 +49,7 @@
                                  GrPathRendererChain* prChain);
 
 
-    GrPathRenderer(void);
-    /**
-     * Returns true if this path renderer is able to render the path.
-     * Returning false allows the caller to fallback to another path renderer.
-     * When searching for a path renderer capable of rendering a path this
-     * function is called.
-     *
-     * @param targetCaps The caps of the draw target that will be used to draw
-     *                   the path.
-     * @param path       The path to draw
-     * @param fill       The fill rule to use
-     * @param antiAlias  True if anti-aliasing is required.
-     *
-     * @return  true if the path can be drawn by this object, false otherwise.
-     */
-    virtual bool canDrawPath(const GrDrawTarget::Caps& targetCaps,
-                             const SkPath& path,
-                             GrPathFill fill,
-                             bool antiAlias) const = 0;
+    GrPathRenderer();
 
     /**
      * For complex clips Gr uses the stencil buffer. The path renderer must be
@@ -90,54 +72,37 @@
      *         returns true the drawPathToStencil will be used when rendering
      *         clips.
      */
-    virtual bool requiresStencilPass(const GrDrawTarget* target,
-                                     const SkPath& path,
-                                     GrPathFill fill) const { return false; }
+    virtual bool requiresStencilPass(const SkPath& path,
+                                     GrPathFill fill,
+                                     const GrDrawTarget* target) const {
+        return false;
+    }
 
-    /**
-     * Sets the path to render and target to render into. All calls to drawPath
-     * and drawPathToStencil must occur between setPath and clearPath. The
-     * path cannot be modified externally between setPath and clearPath. The
-     * path may be drawn several times (e.g. tiled supersampler). The target's
-     * state may change between setPath and drawPath* calls. However, if the
-     * path renderer specified vertices/indices during setPath or drawPath*
-     * they will still be set at subsequent drawPath* calls until the next
-     * clearPath. The target's draw state may change between drawPath* calls
-     * so if the subclass does any caching of tesselation, etc. then it must
-     * validate that target parameters that guided the decisions still hold.
-     *
-     * @param target                the target to draw into.
-     * @param path                  the path to draw.
-     * @param fill                  the fill rule to apply.
-     * @param antiAlias             perform antiAliasing when drawing the path.
-     * @param translate             optional additional translation to apply to
-     *                              the path. NULL means (0,0).
-     */
-    void setPath(GrDrawTarget* target,
-                 const SkPath* path,
-                 GrPathFill fill,
-                 bool antiAlias,
-                 const GrPoint* translate);
-
-    /**
-     * Notifies path renderer that path set in setPath is no longer in use.
-     */
-    void clearPath();
-
+    virtual bool canDrawPath(const SkPath& path,
+                             GrPathFill fill,
+                             const GrDrawTarget* target,
+                             bool antiAlias) const = 0;
     /**
      * Draws the path into the draw target. If requiresStencilBuffer returned
      * false then the target may be setup for stencil rendering (since the 
      * path renderer didn't claim that it needs to use the stencil internally).
      *
-     * Only called between setPath / clearPath.
-     *
      * @param stages                bitfield that indicates which stages are
      *                              in use. All enabled stages expect positions
      *                              as texture coordinates. The path renderer
      *                              use the remaining stages for its path
      *                              filling algorithm.
      */
-    virtual void drawPath(GrDrawState::StageMask stageMask) = 0;
+    virtual bool drawPath(const SkPath& path,
+                          GrPathFill fill,
+                          const GrVec* translate,
+                          GrDrawTarget* target,
+                          GrDrawState::StageMask stageMask,
+                          bool antiAlias) {
+        GrAssert(this->canDrawPath(path, fill, target, antiAlias));
+        return this->onDrawPath(path, fill, translate,
+                                target, stageMask, antiAlias);
+    }
 
     /**
      * Draws the path to the stencil buffer. Assume the writable stencil bits
@@ -150,64 +115,20 @@
      * The default implementation assumes the path filling algorithm doesn't
      * require a separate stencil pass and so crashes.
      *
-     * Only called between setPath / clearPath.
      */
-    virtual void drawPathToStencil() {
+    virtual void drawPathToStencil(const SkPath& path,
+                                   GrPathFill fill,
+                                   GrDrawTarget* target) {
         GrCrash("Unexpected call to drawPathToStencil.");
     }
 
-    /**
-     * Helper that sets a path and automatically remove it in destructor.
-     */
-    class AutoClearPath {
-    public:
-        AutoClearPath() {
-            fPathRenderer = NULL;
-        }
-        AutoClearPath(GrPathRenderer* pr,
-                      GrDrawTarget* target,
-                      const SkPath* path,
-                      GrPathFill fill,
-                      bool antiAlias,
-                      const GrPoint* translate) {
-            GrAssert(NULL != pr);
-            pr->setPath(target, path, fill, antiAlias, translate);
-            fPathRenderer = pr;
-        }
-        void set(GrPathRenderer* pr,
-                 GrDrawTarget* target,
-                 const SkPath* path,
-                 GrPathFill fill,
-                 bool antiAlias,
-                 const GrPoint* translate) {
-            if (NULL != fPathRenderer) {
-                fPathRenderer->clearPath();
-            }
-            GrAssert(NULL != pr);
-            pr->setPath(target, path, fill, antiAlias, translate);
-            fPathRenderer = pr;
-        }
-        ~AutoClearPath() {
-            if (NULL != fPathRenderer) {
-                fPathRenderer->clearPath();
-            }
-        }
-    private:
-        GrPathRenderer* fPathRenderer;
-    };
-
 protected:
-
-    // subclass can override these to be notified just after a path is set
-    // and just before the path is cleared.
-    virtual void pathWasSet() {}
-    virtual void pathWillClear() {}
-
-    const SkPath*               fPath;
-    GrDrawTarget*               fTarget;
-    GrPathFill                  fFill;
-    GrPoint                     fTranslate;
-    bool                        fAntiAlias;
+    virtual bool onDrawPath(const SkPath& path,
+                            GrPathFill fill,
+                            const GrVec* translate,
+                            GrDrawTarget* target,
+                            GrDrawState::StageMask stageMask,
+                            bool antiAlias) = 0;
 
 private:
 
diff --git a/src/gpu/GrPathRendererChain.cpp b/src/gpu/GrPathRendererChain.cpp
index a45437b..00f0b81 100644
--- a/src/gpu/GrPathRendererChain.cpp
+++ b/src/gpu/GrPathRendererChain.cpp
@@ -17,9 +17,8 @@
     : fInit(false)
     , fOwner(context)
     , fFlags(flags) {
-    fInit = false;
 }
-    
+
 GrPathRendererChain::~GrPathRendererChain() {
     for (int i = 0; i < fChain.count(); ++i) {
         fChain[i]->unref();
@@ -32,16 +31,15 @@
     return pr;
 }
 
-GrPathRenderer* GrPathRendererChain::getPathRenderer(
-                                        const GrDrawTarget::Caps& targetCaps,
-                                        const GrPath& path,
-                                        GrPathFill fill,
-                                        bool antiAlias) {
+GrPathRenderer* GrPathRendererChain::getPathRenderer(const SkPath& path,
+                                                     GrPathFill fill,
+                                                     const GrDrawTarget* target,
+                                                     bool antiAlias) {
     if (!fInit) {
         this->init();
     }
     for (int i = 0; i < fChain.count(); ++i) {
-        if (fChain[i]->canDrawPath(targetCaps, path, fill, antiAlias)) {
+        if (fChain[i]->canDrawPath(path, fill, target, antiAlias)) {
             return fChain[i];
         }
     }
diff --git a/src/gpu/GrPathRendererChain.h b/src/gpu/GrPathRendererChain.h
index 8f95ea3..54737cb 100644
--- a/src/gpu/GrPathRendererChain.h
+++ b/src/gpu/GrPathRendererChain.h
@@ -40,9 +40,9 @@
     // takes a ref and unrefs in destructor
     GrPathRenderer* addPathRenderer(GrPathRenderer* pr);
 
-    GrPathRenderer* getPathRenderer(const GrDrawTarget::Caps& targetCaps,
-                                    const SkPath& path,
+    GrPathRenderer* getPathRenderer(const SkPath& path,
                                     GrPathFill fill,
+                                    const GrDrawTarget* target,
                                     bool antiAlias);
 
 private:
diff --git a/src/gpu/GrPathUtils.cpp b/src/gpu/GrPathUtils.cpp
index 0a7759d..c32ee8e 100644
--- a/src/gpu/GrPathUtils.cpp
+++ b/src/gpu/GrPathUtils.cpp
@@ -8,8 +8,8 @@
 
 
 #include "GrPathUtils.h"
-
 #include "GrPoint.h"
+#include "SkGeometry.h"
 
 GrScalar GrPathUtils::scaleToleranceToSrc(GrScalar devTol,
                                           const GrMatrix& viewM,
@@ -186,3 +186,151 @@
     }
     return pointCount;
 }
+
+namespace {
+// The matrix computed for quadDesignSpaceToUVCoordsMatrix should never really
+// have perspective and we really want to avoid perspective matrix muls.
+// However, the first two entries of the perspective row may be really close to
+// 0 and the third may not be 1 due to a scale on the entire matrix.
+inline void fixup_matrix(GrMatrix* mat) {
+#ifndef SK_SCALAR_IS_FLOAT
+    GrCrash("Expected scalar is float.");
+#endif
+     static const GrScalar gTOL = 1.f / 100.f;
+    GrAssert(GrScalarAbs(mat->get(SkMatrix::kMPersp0)) < gTOL);
+    GrAssert(GrScalarAbs(mat->get(SkMatrix::kMPersp1)) < gTOL);
+    float m33 = mat->get(SkMatrix::kMPersp2);
+    if (1.f != m33) {
+        m33 = 1.f / m33;
+        mat->setAll(m33 * mat->get(SkMatrix::kMScaleX),
+                    m33 * mat->get(SkMatrix::kMSkewX),
+                    m33 * mat->get(SkMatrix::kMTransX),
+                    m33 * mat->get(SkMatrix::kMSkewY),
+                    m33 * mat->get(SkMatrix::kMScaleY),
+                    m33 * mat->get(SkMatrix::kMTransY),
+                    0.f, 0.f, 1.f);
+    } else {
+        mat->setPerspX(0);
+        mat->setPerspY(0);
+    }
+}
+}
+
+// Compute a matrix that goes from the 2d space coordinates to UV space where
+// u^2-v = 0 specifies the quad.
+void GrPathUtils::quadDesignSpaceToUVCoordsMatrix(const SkPoint qPts[3],
+                                                  GrMatrix* matrix) {
+    // can't make this static, no constructor :(
+    SkMatrix UVpts;
+#ifndef SK_SCALAR_IS_FLOAT
+    GrCrash("Expected scalar is float.");
+#endif
+    // We want M such that M * xy_pt = uv_pt
+    // We know M * control_pts = [0  1/2 1]
+    //                           [0  0   1]
+    //                           [1  1   1]
+    // We invert the control pt matrix and post concat to both sides to get M.
+    UVpts.setAll(0,   0.5f,  1.f,
+                 0,   0,     1.f,
+                 1.f, 1.f,   1.f);
+    matrix->setAll(qPts[0].fX, qPts[1].fX, qPts[2].fX,
+                   qPts[0].fY, qPts[1].fY, qPts[2].fY,
+                   1.f,        1.f,        1.f);
+    if (!matrix->invert(matrix)) {
+        // The quad is degenerate. Hopefully this is rare. Find the pts that are
+        // farthest apart to compute a line (unless it is really a pt).
+        SkScalar maxD = qPts[0].distanceToSqd(qPts[1]);
+        int maxEdge = 0;
+        SkScalar d = qPts[1].distanceToSqd(qPts[2]);
+        if (d > maxD) {
+            maxD = d;
+            maxEdge = 1;
+        }
+        d = qPts[2].distanceToSqd(qPts[0]);
+        if (d > maxD) {
+            maxD = d;
+            maxEdge = 2;
+        }
+        // We could have a tolerance here, not sure if it would improve anything
+        if (maxD > 0) {
+            // Set the matrix to give (u = 0, v = distance_to_line)
+            GrVec lineVec = qPts[(maxEdge + 1)%3] - qPts[maxEdge];
+            // when looking from the point 0 down the line we want positive
+            // distances to be to the left. This matches the non-degenerate
+            // case.
+            lineVec.setOrthog(lineVec, GrPoint::kLeft_Side);
+            lineVec.dot(qPts[0]);
+            matrix->setAll(0, 0, 0,
+                           lineVec.fX, lineVec.fY, -lineVec.dot(qPts[maxEdge]),
+                           0, 0, 1.f);
+        } else {
+            // It's a point. It should cover zero area. Just set the matrix such
+            // that (u, v) will always be far away from the quad.
+            matrix->setAll(0, 0, 100 * SK_Scalar1,
+                           0, 0, 100 * SK_Scalar1,
+                           0, 0, 1.f);
+        }
+    } else {
+        matrix->postConcat(UVpts);
+        fixup_matrix(matrix);
+    }
+}
+
+namespace {
+void convert_noninflect_cubic_to_quads(const SkPoint p[4],
+                                       SkScalar tolScale,
+                                       SkTArray<SkPoint, true>* quads,
+                                       int sublevel = 0) {
+    SkVector ab = p[1];
+    ab -= p[0];
+    SkVector dc = p[2];
+    dc -= p[3];
+
+    static const SkScalar gLengthScale = 3 * SK_Scalar1 / 2;
+    // base tolerance is 2 pixels in dev coords.
+    const SkScalar distanceSqdTol = SkScalarMul(tolScale, 1 * SK_Scalar1);
+    static const int kMaxSubdivs = 10;
+
+    ab.scale(gLengthScale);
+    dc.scale(gLengthScale);
+
+    SkVector c0 = p[0];
+    c0 += ab;
+    SkVector c1 = p[3];
+    c1 += dc;
+
+    SkScalar dSqd = c0.distanceToSqd(c1);
+    if (sublevel > kMaxSubdivs || dSqd <= distanceSqdTol) {
+        SkPoint cAvg = c0;
+        cAvg += c1;
+        cAvg.scale(SK_ScalarHalf);
+
+        SkPoint* pts = quads->push_back_n(3);
+        pts[0] = p[0];
+        pts[1] = cAvg;
+        pts[2] = p[3];
+
+        return;
+    } else {
+        SkPoint choppedPts[7];
+        SkChopCubicAtHalf(p, choppedPts);
+        convert_noninflect_cubic_to_quads(choppedPts + 0, tolScale, 
+                                          quads, sublevel + 1);
+        convert_noninflect_cubic_to_quads(choppedPts + 3, tolScale,
+                                          quads, sublevel + 1);
+    }
+}
+}
+
+void GrPathUtils::convertCubicToQuads(const GrPoint p[4],
+                                      SkScalar tolScale,
+                                      SkTArray<SkPoint, true>* quads) {
+    SkPoint chopped[10];
+    int count = SkChopCubicAtInflections(p, chopped);
+
+    for (int i = 0; i < count; ++i) {
+        SkPoint* cubic = chopped + 3*i;
+        convert_noninflect_cubic_to_quads(cubic, tolScale, quads);
+    }
+
+}
diff --git a/src/gpu/GrPathUtils.h b/src/gpu/GrPathUtils.h
index 5dc06aa..df2e16c 100644
--- a/src/gpu/GrPathUtils.h
+++ b/src/gpu/GrPathUtils.h
@@ -12,6 +12,7 @@
 
 #include "GrMatrix.h"
 #include "GrPath.h"
+#include "SkTArray.h"
 
 /**
  *  Utilities for evaluating paths.
@@ -45,6 +46,16 @@
                                  GrScalar tolSqd,
                                  GrPoint** points,
                                  uint32_t pointsLeft);
-
+    // Compute a matrix that goes from the 2d space coordinates to UV space
+    // where u^2-v = 0 specifies the quad.
+    void quadDesignSpaceToUVCoordsMatrix(const GrPoint qPts[3],
+                                         GrMatrix* matrix);
+    // Converts a cubic into a sequence of quads. If working in device space
+    // use tolScale = 1, otherwise set based on stretchiness of the matrix. The
+    // result is sets of 3 points in quads (TODO: share endpoints in returned
+    // array)
+    void convertCubicToQuads(const GrPoint p[4],
+                             SkScalar tolScale,
+                             SkTArray<SkPoint, true>* quads);
 };
 #endif
diff --git a/src/gpu/GrRenderTarget.cpp b/src/gpu/GrRenderTarget.cpp
index 7901648..ed1a018 100644
--- a/src/gpu/GrRenderTarget.cpp
+++ b/src/gpu/GrRenderTarget.cpp
@@ -41,6 +41,15 @@
                                      config, buffer, rowBytes);
 }
 
+void GrRenderTarget::resolve() {
+    // The resolve is routed through this target's context so that all
+    // necessary flushing occurs; without a context this is a no-op.
+    GrContext* const ctx = this->getContext();
+    if (NULL != ctx) {
+        ctx->resolveRenderTarget(this);
+    }
+}
+
 size_t GrRenderTarget::sizeInBytes() const {
     int colorBits;
     if (kUnknown_GrPixelConfig == fConfig) {
diff --git a/src/gpu/GrResourceCache.cpp b/src/gpu/GrResourceCache.cpp
index afbe9b3..ba6452d 100644
--- a/src/gpu/GrResourceCache.cpp
+++ b/src/gpu/GrResourceCache.cpp
@@ -304,14 +304,18 @@
     fMaxCount = 0;
     this->purgeAsNeeded();
 
-    GrAssert(!fCache.count());
+#if GR_DEBUG
     GrAssert(!fUnlockedEntryCount);
-    // Items may have been detached from the cache (such as the backing texture
-    // for an SkGpuDevice). The above purge would not have removed them.
-    GrAssert(fEntryCount == fClientDetachedCount);
-    GrAssert(fEntryBytes == fClientDetachedBytes);
-    GrAssert(NULL == fHead);
-    GrAssert(NULL == fTail);
+    if (!fCache.count()) {
+        // Items may have been detached from the cache (such as the backing
+        // texture for an SkGpuDevice). The above purge would not have removed
+        // them.
+        GrAssert(fEntryCount == fClientDetachedCount);
+        GrAssert(fEntryBytes == fClientDetachedBytes);
+        GrAssert(NULL == fHead);
+        GrAssert(NULL == fTail);
+    }
+#endif
 
     fMaxBytes = savedMaxBytes;
     fMaxCount = savedMaxCount;
diff --git a/src/gpu/GrResourceCache.h b/src/gpu/GrResourceCache.h
index e21c605..f86fcd2 100644
--- a/src/gpu/GrResourceCache.h
+++ b/src/gpu/GrResourceCache.h
@@ -209,6 +209,11 @@
     void setLimits(int maxResource, size_t maxResourceBytes);
 
     /**
+     * Returns the number of bytes consumed by cached resources.
+     */
+    size_t getCachedResourceBytes() const { return fEntryBytes; }
+
+    /**
      * Controls whether locks should be nestable or not.
      */
     enum LockType {
@@ -315,4 +320,3 @@
 #endif
 
 #endif
-
diff --git a/src/gpu/GrStencil.h b/src/gpu/GrStencil.h
index ae81840..143e525 100644
--- a/src/gpu/GrStencil.h
+++ b/src/gpu/GrStencil.h
@@ -84,6 +84,13 @@
     kStencilOpCount
 };
 
+enum GrStencilFlags {
+    kIsDisabled_StencilFlag      = 1 << 0,  // isDisabled() known to be true
+    kNotDisabled_StencilFlag     = 1 << 1,  // isDisabled() known to be false
+    kDoesWrite_StencilFlag       = 1 << 2,  // doesWrite() known to be true
+    kDoesNotWrite_StencilFlag    = 1 << 3,  // doesWrite() known to be false
+};
+
 /**
  * GrStencilState needs to be a class with accessors and setters so that it
  * can maintain flags related to its current state. However, we also want to
@@ -121,6 +128,45 @@
                  2*sizeof(unsigned short) + // write masks
                  sizeof(uint32_t)); // flags
 
+// The macros below compute the GrStencilSettingsStruct flags from its ops and
+// funcs: whether stenciling is disabled and whether it writes. They are used
+// both to define constant structure initializers and inside
+// GrStencilSettings::isDisabled() and GrStencilSettings::doesWrite().
+#define GR_STENCIL_SETTINGS_IS_DISABLED(                                     \
+    FRONT_PASS_OP,    BACK_PASS_OP,                                          \
+    FRONT_FAIL_OP,    BACK_FAIL_OP,                                          \
+    FRONT_FUNC,       BACK_FUNC)                                             \
+    ((FRONT_PASS_OP) == kKeep_StencilOp &&                                   \
+     (BACK_PASS_OP)  == kKeep_StencilOp &&                                   \
+     (FRONT_FAIL_OP) == kKeep_StencilOp &&                                   \
+     (BACK_FAIL_OP)  == kKeep_StencilOp &&                                   \
+     (FRONT_FUNC)    == kAlways_StencilFunc &&                               \
+     (BACK_FUNC)     == kAlways_StencilFunc)
+
+#define GR_STENCIL_SETTINGS_DOES_WRITE(                                      \
+    FRONT_PASS_OP,    BACK_PASS_OP,                                          \
+    FRONT_FAIL_OP,    BACK_FAIL_OP,                                          \
+    FRONT_FUNC,       BACK_FUNC)                                             \
+    (!(((FRONT_FUNC) == kNever_StencilFunc  ||                               \
+        (FRONT_PASS_OP) == kKeep_StencilOp)  &&                              \
+       ((BACK_FUNC) == kNever_StencilFunc  ||                                \
+        (BACK_PASS_OP)  == kKeep_StencilOp) &&                               \
+       ((FRONT_FUNC) == kAlways_StencilFunc ||                               \
+        (FRONT_FAIL_OP) == kKeep_StencilOp) &&                               \
+       ((BACK_FUNC)  == kAlways_StencilFunc ||                               \
+        (BACK_FAIL_OP)  == kKeep_StencilOp)))
+
+#define GR_STENCIL_SETTINGS_DEFAULT_FLAGS(                                   \
+    FRONT_PASS_OP,    BACK_PASS_OP,                                          \
+    FRONT_FAIL_OP,    BACK_FAIL_OP,                                          \
+    FRONT_FUNC,       BACK_FUNC)                                             \
+  ((GR_STENCIL_SETTINGS_IS_DISABLED(FRONT_PASS_OP,BACK_PASS_OP,              \
+      FRONT_FAIL_OP,BACK_FAIL_OP,FRONT_FUNC,BACK_FUNC) ?                     \
+      kIsDisabled_StencilFlag : kNotDisabled_StencilFlag) |                  \
+   (GR_STENCIL_SETTINGS_DOES_WRITE(FRONT_PASS_OP,BACK_PASS_OP,               \
+      FRONT_FAIL_OP,BACK_FAIL_OP,FRONT_FUNC,BACK_FUNC) ?                     \
+      kDoesWrite_StencilFlag : kDoesNotWrite_StencilFlag))
+
 /**
  * Class representing stencil state.
  */
@@ -183,42 +229,36 @@
         memset(this, 0, sizeof(*this));
         GR_STATIC_ASSERT(0 == kKeep_StencilOp);
         GR_STATIC_ASSERT(0 == kAlways_StencilFunc);
-        fFlags = kIsDisabled_Flag | kDoesNotWrite_Flag;
+        fFlags = kIsDisabled_StencilFlag | kDoesNotWrite_StencilFlag;
     }
 
     bool isDisabled() const {
-        if (fFlags & kIsDisabled_Flag) {
+        if (fFlags & kIsDisabled_StencilFlag) {
             return true;
         }
-        if (fFlags & kNotDisabled_Flag) {
+        if (fFlags & kNotDisabled_StencilFlag) {
             return false;
         }
-        bool disabled = kKeep_StencilOp == fFrontPassOp   &&
-                        kKeep_StencilOp == fBackPassOp    &&
-                        kKeep_StencilOp == fFrontFailOp   &&
-                        kKeep_StencilOp == fBackFailOp   &&
-                        kAlways_StencilFunc == fFrontFunc &&
-                        kAlways_StencilFunc == fBackFunc;
-        fFlags |= disabled ? kIsDisabled_Flag : kNotDisabled_Flag;
+        bool disabled = GR_STENCIL_SETTINGS_IS_DISABLED(
+                            fFrontPassOp, fBackPassOp,
+                            fFrontFailOp, fBackFailOp,
+                            fFrontFunc ,fBackFunc);
+        fFlags |= disabled ? kIsDisabled_StencilFlag : kNotDisabled_StencilFlag;
         return disabled;
     }
 
     bool doesWrite() const {
-        if (fFlags & kDoesWrite_Flag) {
+        if (fFlags & kDoesWrite_StencilFlag) {
             return true;
         }
-        if (fFlags & kDoesNotWrite_Flag) {
+        if (fFlags & kDoesNotWrite_StencilFlag) {
             return false;
         }
-        bool writes = !((kNever_StencilFunc == fFrontFunc ||
-                         kKeep_StencilOp == fFrontPassOp)  &&
-                        (kNever_StencilFunc == fBackFunc ||
-                         kKeep_StencilOp == fBackPassOp)    &&
-                        (kAlways_StencilFunc == fFrontFunc ||
-                         kKeep_StencilOp == fFrontFailOp)  &&
-                        (kAlways_StencilFunc == fBackFunc ||
-                         kKeep_StencilOp == fBackFailOp));
-        fFlags |= writes ? kDoesWrite_Flag : kDoesNotWrite_Flag;
+        bool writes = GR_STENCIL_SETTINGS_DOES_WRITE(
+                        fFrontPassOp, fBackPassOp,
+                        fFrontFailOp, fBackFailOp,
+                        fFrontFunc, fBackFunc);
+        fFlags |= writes ? kDoesWrite_StencilFlag : kDoesNotWrite_StencilFlag;
         return writes;
     }
     
@@ -251,13 +291,6 @@
 private:
     friend class GrGpu;
     enum {
-        kIsDisabled_Flag      = 0x1,
-        kNotDisabled_Flag     = 0x2,
-        kDoesWrite_Flag       = 0x4,
-        kDoesNotWrite_Flag    = 0x8,
-    };
-
-    enum {
         kMaxStencilClipPasses = 2  // maximum number of passes to add a clip 
                                    // element to the stencil buffer.
     };
@@ -295,14 +328,14 @@
 
 GR_STATIC_ASSERT(sizeof(GrStencilSettingsStruct) == sizeof(GrStencilSettings));
 
-#define GR_STATIC_CONST_STENCIL(NAME,                                        \
+#define GR_STATIC_CONST_STENCIL_STRUCT(STRUCT_NAME,                          \
     FRONT_PASS_OP,    BACK_PASS_OP,                                          \
     FRONT_FAIL_OP,    BACK_FAIL_OP,                                          \
     FRONT_FUNC,       BACK_FUNC,                                             \
     FRONT_MASK,       BACK_MASK,                                             \
     FRONT_REF,        BACK_REF,                                              \
     FRONT_WRITE_MASK, BACK_WRITE_MASK)                                       \
-    static const GrStencilSettingsStruct NAME ## _STRUCT = {                 \
+    static const GrStencilSettingsStruct STRUCT_NAME = {                     \
         (FRONT_PASS_OP),    (BACK_PASS_OP),                                  \
         (FRONT_FAIL_OP),    (BACK_FAIL_OP),                                  \
         (FRONT_FUNC),       (BACK_FUNC),                                     \
@@ -310,14 +343,39 @@
         (FRONT_MASK),       (BACK_MASK),                                     \
         (FRONT_REF),        (BACK_REF),                                      \
         (FRONT_WRITE_MASK), (BACK_WRITE_MASK),                               \
-        0                                                                    \
-    };                                                                       \
+        GR_STENCIL_SETTINGS_DEFAULT_FLAGS(                                   \
+            FRONT_PASS_OP, BACK_PASS_OP, FRONT_FAIL_OP, BACK_FAIL_OP,        \
+            FRONT_FUNC, BACK_FUNC)                                           \
+    };
+
+#define GR_CONST_STENCIL_SETTINGS_PTR_FROM_STRUCT_PTR(STRUCT_PTR)            \
+    reinterpret_cast<const GrStencilSettings*>(STRUCT_PTR)
+
+#define GR_STATIC_CONST_SAME_STENCIL_STRUCT(STRUCT_NAME,                     \
+    PASS_OP, FAIL_OP, FUNC, MASK, REF, WRITE_MASK)                           \
+    GR_STATIC_CONST_STENCIL_STRUCT(STRUCT_NAME, (PASS_OP), (PASS_OP),        \
+    (FAIL_OP),(FAIL_OP), (FUNC), (FUNC), (MASK), (MASK), (REF), (REF),       \
+    (WRITE_MASK),(WRITE_MASK))
+
+#define GR_STATIC_CONST_STENCIL(NAME,                                        \
+    FRONT_PASS_OP,    BACK_PASS_OP,                                          \
+    FRONT_FAIL_OP,    BACK_FAIL_OP,                                          \
+    FRONT_FUNC,       BACK_FUNC,                                             \
+    FRONT_MASK,       BACK_MASK,                                             \
+    FRONT_REF,        BACK_REF,                                              \
+    FRONT_WRITE_MASK, BACK_WRITE_MASK)                                       \
+    GR_STATIC_CONST_STENCIL_STRUCT(NAME ## _STRUCT,                          \
+    (FRONT_PASS_OP),(BACK_PASS_OP),(FRONT_FAIL_OP),(BACK_FAIL_OP),           \
+    (FRONT_FUNC),(BACK_FUNC),(FRONT_MASK),(BACK_MASK),                       \
+    (FRONT_REF),(BACK_REF),(FRONT_WRITE_MASK),(BACK_WRITE_MASK))             \
     static const GrStencilSettings& NAME =                                   \
-        *reinterpret_cast<const GrStencilSettings*>(&(NAME ## _STRUCT))
-#endif
+        *GR_CONST_STENCIL_SETTINGS_PTR_FROM_STRUCT_PTR(&(NAME ## _STRUCT));
+
 
 #define GR_STATIC_CONST_SAME_STENCIL(NAME,                                   \
     PASS_OP, FAIL_OP, FUNC, MASK, REF, WRITE_MASK)                           \
     GR_STATIC_CONST_STENCIL(NAME, (PASS_OP), (PASS_OP), (FAIL_OP),           \
     (FAIL_OP), (FUNC), (FUNC), (MASK), (MASK), (REF), (REF), (WRITE_MASK),   \
     (WRITE_MASK))
+
+#endif
diff --git a/src/gpu/GrTesselatedPathRenderer.cpp b/src/gpu/GrTesselatedPathRenderer.cpp
index f6fcdef..3823bbd 100644
--- a/src/gpu/GrTesselatedPathRenderer.cpp
+++ b/src/gpu/GrTesselatedPathRenderer.cpp
@@ -347,20 +347,26 @@
     return edges->count();
 }
 
-void GrTesselatedPathRenderer::drawPath(GrDrawState::StageMask stageMask) {
-    GrDrawTarget::AutoStateRestore asr(fTarget);
-    GrDrawState* drawState = fTarget->drawState();
+bool GrTesselatedPathRenderer::onDrawPath(const SkPath& path,
+                                          GrPathFill fill,
+                                          const GrVec* translate,
+                                          GrDrawTarget* target,
+                                          GrDrawState::StageMask stageMask,
+                                          bool antiAlias) {
+
+    GrDrawTarget::AutoStateRestore asr(target);
+    GrDrawState* drawState = target->drawState();
     // face culling doesn't make sense here
     GrAssert(GrDrawState::kBoth_DrawFace == drawState->getDrawFace());
 
     GrMatrix viewM = drawState->getViewMatrix();
 
     GrScalar tol = GR_Scalar1;
-    tol = GrPathUtils::scaleToleranceToSrc(tol, viewM, fPath->getBounds());
+    tol = GrPathUtils::scaleToleranceToSrc(tol, viewM, path.getBounds());
     GrScalar tolSqd = GrMul(tol, tol);
 
     int subpathCnt;
-    int maxPts = GrPathUtils::worstCasePointCount(*fPath, &subpathCnt, tol);
+    int maxPts = GrPathUtils::worstCasePointCount(path, &subpathCnt, tol);
 
     GrVertexLayout layout = 0;
     for (int s = 0; s < GrDrawState::kNumStages; ++s) {
@@ -369,13 +375,13 @@
         }
     }
 
-    bool inverted = GrIsFillInverted(fFill);
+    bool inverted = GrIsFillInverted(fill);
     if (inverted) {
         maxPts += 4;
         subpathCnt++;
     }
     if (maxPts > USHRT_MAX) {
-        return;
+        return false;
     }
     SkAutoSTMalloc<8, GrPoint> baseMem(maxPts);
     GrPoint* base = baseMem;
@@ -385,7 +391,7 @@
     SkAutoSTMalloc<8, uint16_t> subpathVertCount(subpathCnt);
 
     GrPoint pts[4];
-    SkPath::Iter iter(*fPath, false);
+    SkPath::Iter iter(path, false);
 
     bool first = true;
     int subpath = 0;
@@ -427,9 +433,9 @@
         first = false;
     }
 FINISHED:
-    if (0 != fTranslate.fX || 0 != fTranslate.fY) {
+    if (NULL != translate && (0 != translate->fX || 0 != translate->fY)) {
         for (int i = 0; i < vert - base; i++) {
-            base[i].offset(fTranslate.fX, fTranslate.fY);
+            base[i].offset(translate->fX, translate->fY);
         }
     }
 
@@ -456,25 +462,25 @@
     size_t count = vert - base;
 
     if (count < 3) {
-        return;
+        return true;
     }
 
-    if (subpathCnt == 1 && !inverted && fPath->isConvex()) {
-        if (fAntiAlias) {
+    if (subpathCnt == 1 && !inverted && path.isConvex()) {
+        if (antiAlias) {
             GrEdgeArray edges;
             GrMatrix inverse, matrix = drawState->getViewMatrix();
             drawState->getViewInverse(&inverse);
 
             count = computeEdgesAndIntersect(matrix, inverse, base, count, &edges, 0.0f);
-            size_t maxEdges = fTarget->getMaxEdges();
+            size_t maxEdges = target->getMaxEdges();
             if (count == 0) {
-                return;
+                return true;
             }
             if (count <= maxEdges) {
                 // All edges fit; upload all edges and draw all verts as a fan
-                fTarget->setVertexSourceToArray(layout, base, count);
+                target->setVertexSourceToArray(layout, base, count);
                 drawState->setEdgeAAData(&edges[0], count);
-                fTarget->drawNonIndexed(kTriangleFan_PrimitiveType, 0, count);
+                target->drawNonIndexed(kTriangleFan_PrimitiveType, 0, count);
             } else {
                 // Upload "maxEdges" edges and verts at a time, and draw as
                 // separate fans
@@ -482,31 +488,31 @@
                     edges[i] = edges[0];
                     base[i] = base[0];
                     int size = GR_CT_MIN(count - i, maxEdges);
-                    fTarget->setVertexSourceToArray(layout, &base[i], size);
+                    target->setVertexSourceToArray(layout, &base[i], size);
                     drawState->setEdgeAAData(&edges[i], size);
-                    fTarget->drawNonIndexed(kTriangleFan_PrimitiveType, 0, size);
+                    target->drawNonIndexed(kTriangleFan_PrimitiveType, 0, size);
                 }
             }
             drawState->setEdgeAAData(NULL, 0);
         } else {
-            fTarget->setVertexSourceToArray(layout, base, count);
-            fTarget->drawNonIndexed(kTriangleFan_PrimitiveType, 0, count);
+            target->setVertexSourceToArray(layout, base, count);
+            target->drawNonIndexed(kTriangleFan_PrimitiveType, 0, count);
         }
-        return;
+        return true;
     }
 
-    if (fAntiAlias) {
+    if (antiAlias) {
         // Run the tesselator once to get the boundaries.
-        GrBoundaryTess btess(count, fill_type_to_glu_winding_rule(fFill));
+        GrBoundaryTess btess(count, fill_type_to_glu_winding_rule(fill));
         btess.addVertices(base, subpathVertCount, subpathCnt);
 
         GrMatrix inverse, matrix = drawState->getViewMatrix();
         if (!drawState->getViewInverse(&inverse)) {
-            return;
+            return false;
         }
 
         if (btess.vertices().count() > USHRT_MAX) {
-            return;
+            return false;
         }
 
         // Inflate the boundary, and run the tesselator again to generate
@@ -532,7 +538,7 @@
         Sk_gluTessEndPolygon(ptess.tess());
 
         if (ptess.vertices().count() > USHRT_MAX) {
-            return;
+            return false;
         }
 
         // Draw the resulting polys and upload their edge data.
@@ -570,37 +576,34 @@
                 tri_edges[t++] = edge5;
             }
             drawState->setEdgeAAData(&tri_edges[0], t);
-            fTarget->setVertexSourceToArray(layout, &tri_verts[0], 3);
-            fTarget->drawNonIndexed(kTriangles_PrimitiveType, 0, 3);
+            target->setVertexSourceToArray(layout, &tri_verts[0], 3);
+            target->drawNonIndexed(kTriangles_PrimitiveType, 0, 3);
         }
         drawState->setEdgeAAData(NULL, 0);
         drawState->disableState(GrDrawState::kEdgeAAConcave_StateBit);
-        return;
+        return true;
     }
 
-    GrPolygonTess ptess(count, fill_type_to_glu_winding_rule(fFill));
+    GrPolygonTess ptess(count, fill_type_to_glu_winding_rule(fill));
     ptess.addVertices(base, subpathVertCount, subpathCnt);
     const GrPointArray& vertices = ptess.vertices();
     const GrIndexArray& indices = ptess.indices();
     if (indices.count() > 0) {
-        fTarget->setVertexSourceToArray(layout, vertices.begin(), vertices.count());
-        fTarget->setIndexSourceToArray(indices.begin(), indices.count());
-        fTarget->drawIndexed(kTriangles_PrimitiveType,
+        target->setVertexSourceToArray(layout, vertices.begin(), vertices.count());
+        target->setIndexSourceToArray(indices.begin(), indices.count());
+        target->drawIndexed(kTriangles_PrimitiveType,
                             0,
                             0,
                             vertices.count(),
                             indices.count());
     }
+    return true;
 }
 
-bool GrTesselatedPathRenderer::canDrawPath(const GrDrawTarget::Caps& caps,
-                                           const SkPath& path,
+bool GrTesselatedPathRenderer::canDrawPath(const SkPath& path,
                                            GrPathFill fill,
+                                           const GrDrawTarget* target,
                                            bool antiAlias) const {
     return kHairLine_PathFill != fill;
 }
 
-void GrTesselatedPathRenderer::drawPathToStencil() {
-    GrAlwaysAssert(!"multipass stencil should not be needed");
-}
-
diff --git a/src/gpu/GrTesselatedPathRenderer.h b/src/gpu/GrTesselatedPathRenderer.h
index e783958..3d12ae9 100644
--- a/src/gpu/GrTesselatedPathRenderer.h
+++ b/src/gpu/GrTesselatedPathRenderer.h
@@ -16,12 +16,17 @@
 public:
     GrTesselatedPathRenderer();
 
-    virtual void drawPath(GrDrawState::StageMask stageMask);
-    virtual bool canDrawPath(const GrDrawTarget::Caps& targetCaps,
-                             const GrPath& path,
+    virtual bool canDrawPath(const SkPath& path,
                              GrPathFill fill,
+                             const GrDrawTarget* target,
                              bool antiAlias) const SK_OVERRIDE;
-    virtual void drawPathToStencil() SK_OVERRIDE;
+
+    virtual bool onDrawPath(const SkPath& path,
+                            GrPathFill fill,
+                            const GrVec* translate,
+                            GrDrawTarget* target,
+                            GrDrawState::StageMask stageMask,
+                            bool antiAlias) SK_OVERRIDE;
 };
 
 #endif
diff --git a/src/gpu/SkGpuDevice.cpp b/src/gpu/SkGpuDevice.cpp
index 96d9fc6..af59699 100644
--- a/src/gpu/SkGpuDevice.cpp
+++ b/src/gpu/SkGpuDevice.cpp
@@ -149,26 +149,36 @@
     this->initFromRenderTarget(context, renderTarget);
 }
 
-void SkGpuDevice::initFromRenderTarget(GrContext* context, 
+void SkGpuDevice::initFromRenderTarget(GrContext* context,
                                        GrRenderTarget* renderTarget) {
     fNeedPrepareRenderTarget = false;
     fDrawProcs = NULL;
-    
+
     fContext = context;
     fContext->ref();
-    
+
     fTexture = NULL;
     fRenderTarget = NULL;
     fNeedClear = false;
-    
+
     GrAssert(NULL != renderTarget);
     fRenderTarget = renderTarget;
     fRenderTarget->ref();
     // if this RT is also a texture, hold a ref on it
     fTexture = fRenderTarget->asTexture();
     SkSafeRef(fTexture);
-
-    SkGrRenderTargetPixelRef* pr = new SkGrRenderTargetPixelRef(fRenderTarget);
+    
+    // Create a pixel ref for the underlying SkBitmap. We prefer a texture pixel
+    // ref to a render target pixel ref. The pixel ref may get ref'ed outside
+    // the device via accessBitmap. This external ref may outlive the device.
+    // Since textures own their render targets (but not vice-versa) we
+    // are ensuring that both objects will live as long as the pixel ref.
+    SkPixelRef* pr;
+    if (fTexture) {
+        pr = new SkGrTexturePixelRef(fTexture);
+    } else {
+        pr = new SkGrRenderTargetPixelRef(fRenderTarget);
+    }
     this->setPixelRef(pr, 0)->unref();
 }
 
@@ -192,7 +202,7 @@
     bm.setConfig(config, width, height);
 
 #if CACHE_LAYER_TEXTURES
-    TexType type = (kSaveLayer_Usage == usage) ? 
+    TexType type = (kSaveLayer_Usage == usage) ?
                             kSaveLayerDeviceRenderTarget_TexType :
                             kDeviceRenderTarget_TexType;
     fCache = this->lockCachedTexture(bm, NULL, type);
@@ -206,10 +216,10 @@
 #else
     const GrTextureDesc desc = {
         kRenderTarget_GrTextureFlagBit,
-        kNone_GrAALevel,
         width,
         height,
-        SkGr::Bitmap2PixelConfig(bm)
+        SkGr::Bitmap2PixelConfig(bm),
+        {0} // samples
     };
 
     fTexture = fContext->createUncachedTexture(desc, NULL, 0);
@@ -244,7 +254,7 @@
         GrAssert(NULL != fTexture);
         GrAssert(fRenderTarget == fTexture->asRenderTarget());
         fContext->unlockTexture(fCache);
-    } 
+    }
     fContext->unref();
 }
 
@@ -375,8 +385,8 @@
     }
 }
 
-SkGpuRenderTarget* SkGpuDevice::accessRenderTarget() { 
-    return (SkGpuRenderTarget*)fRenderTarget; 
+SkGpuRenderTarget* SkGpuDevice::accessRenderTarget() {
+    return (SkGpuRenderTarget*)fRenderTarget;
 }
 
 bool SkGpuDevice::bindDeviceAsTexture(GrPaint* paint) {
@@ -412,6 +422,7 @@
 
     grPaint->fDither    = skPaint.isDither();
     grPaint->fAntiAlias = skPaint.isAntiAlias();
+    grPaint->fCoverage = 0xFF;
 
     SkXfermode::Coeff sm = SkXfermode::kOne_Coeff;
     SkXfermode::Coeff dm = SkXfermode::kISA_Coeff;
@@ -687,6 +698,36 @@
     }
 }
 
+static GrTexture* applyMorphology(GrContext* context, GrTexture* texture,
+                                  const GrRect& srcRect,
+                                  GrTexture* temp1, GrTexture* temp2,
+                                  GrSamplerState::Filter filter,
+                                  SkISize radius) {
+    GrRenderTarget* savedTarget = context->getRenderTarget();
+    GrAutoMatrix identityMatrix(context, GrMatrix::I());
+    GrClip savedClip = context->getClip();
+    context->setClip(GrRect::MakeWH(texture->width(), texture->height()));
+    if (radius.fWidth > 0) {
+        // X pass into temp1; also clear the strip just past srcRect.fBottom.
+        context->setRenderTarget(temp1->asRenderTarget());
+        context->applyMorphology(texture, srcRect, radius.fWidth, filter,
+                                 GrSamplerState::kX_FilterDirection);
+        SkIRect strip = SkIRect::MakeXYWH(srcRect.fLeft, srcRect.fBottom,
+                                          srcRect.width(), radius.fHeight);
+        context->clear(&strip, 0x0);
+        texture = temp1;
+    }
+    if (radius.fHeight > 0) {
+        context->setRenderTarget(temp2->asRenderTarget());
+        context->applyMorphology(texture, srcRect, radius.fHeight, filter,
+                                 GrSamplerState::kY_FilterDirection);
+        texture = temp2;
+    }
+    context->setRenderTarget(savedTarget);
+    context->setClip(savedClip);
+    return texture;
+}
+
 static void buildKernel(float sigma, float* kernel, int kernelWidth) {
     int halfWidth = (kernelWidth - 1) / 2;
     float sum = 0.0f;
@@ -726,7 +767,7 @@
 // Apply a Gaussian blur to srcTexture by sigmaX and sigmaY, within the given
 // rect.
 // temp1 and temp2 are used for allocation of intermediate textures.
-// If temp2 is non-NULL, srcTexture will be untouched, and the return 
+// If temp2 is non-NULL, srcTexture will be untouched, and the return
 // value will be either temp1 or temp2.
 // If temp2 is NULL, srcTexture will be overwritten with intermediate
 // results, and the return value will either be temp1 or srcTexture.
@@ -754,10 +795,10 @@
 
     const GrTextureDesc desc = {
         kRenderTarget_GrTextureFlagBit | kNoStencil_GrTextureFlagBit,
-        kNone_GrAALevel,
         srcRect.width(),
         srcRect.height(),
-        kRGBA_8888_GrPixelConfig
+        kRGBA_8888_GrPixelConfig,
+        {0} // samples 
     };
 
     temp1->set(context, desc);
@@ -773,7 +814,7 @@
                                                    srcTexture->height());
         context->setRenderTarget(dstTexture->asRenderTarget());
         SkRect dstRect(srcRect);
-        scaleRect(&dstRect, i < scaleFactorX ? 0.5f : 1.0f, 
+        scaleRect(&dstRect, i < scaleFactorX ? 0.5f : 1.0f,
                             i < scaleFactorY ? 0.5f : 1.0f);
         paint.setTexture(0, srcTexture);
         context->drawRectToRect(paint, dstRect, srcRect);
@@ -797,7 +838,8 @@
         }
 
         context->setRenderTarget(dstTexture->asRenderTarget());
-        context->convolveInX(srcTexture, srcRect, kernelX, kernelWidthX);
+        context->convolve(srcTexture, srcRect, kernelX, kernelWidthX,
+                          GrSamplerState::kX_FilterDirection);
         SkTSwap(srcTexture, dstTexture);
         if (temp2 && dstTexture == origTexture) dstTexture = temp2->texture();
     }
@@ -816,7 +858,8 @@
         }
 
         context->setRenderTarget(dstTexture->asRenderTarget());
-        context->convolveInY(srcTexture, srcRect, kernelY, kernelWidthY);
+        context->convolve(srcTexture, srcRect, kernelY, kernelWidthY,
+                          GrSamplerState::kY_FilterDirection);
         SkTSwap(srcTexture, dstTexture);
         if (temp2 && dstTexture == origTexture) dstTexture = temp2->texture();
     }
@@ -889,12 +932,12 @@
     srcRect.offset(offset);
     const GrTextureDesc desc = {
         kRenderTarget_GrTextureFlagBit,
-        kNone_GrAALevel,
         srcRect.width(),
         srcRect.height(),
         // We actually only need A8, but it often isn't supported as a
         // render target
-        kRGBA_8888_PM_GrPixelConfig
+        kRGBA_8888_PM_GrPixelConfig,
+        {0} // samples
     };
 
     GrAutoScratchTexture pathEntry(context, desc);
@@ -963,7 +1006,7 @@
     }
     context->setRenderTarget(oldRenderTarget);
     context->setClip(oldClip);
-    
+
     if (grp->hasTextureOrMask()) {
         GrMatrix inverse;
         if (!matrix.invert(&inverse)) {
@@ -1021,10 +1064,10 @@
 
     const GrTextureDesc desc = {
         kNone_GrTextureFlags,
-        kNone_GrAALevel,
         dstM.fBounds.width(),
         dstM.fBounds.height(),
-        kAlpha_8_GrPixelConfig
+        kAlpha_8_GrPixelConfig,
+        {0}, // samples
     };
 
     GrAutoScratchTexture ast(context, desc);
@@ -1061,32 +1104,22 @@
 }
 
 void SkGpuDevice::drawPath(const SkDraw& draw, const SkPath& origSrcPath,
-                           const SkPaint& origPaint, const SkMatrix* prePathMatrix,
+                           const SkPaint& paint, const SkMatrix* prePathMatrix,
                            bool pathIsMutable) {
     CHECK_SHOULD_DRAW(draw);
 
     bool             doFill = true;
-    SkTLazy<SkPaint> lazyPaint;
-    const SkPaint* paint = &origPaint;
-    
-    // can we cheat, and threat a thin stroke as a hairline (w/ modulated alpha)
+
+    SkScalar coverage = SK_Scalar1;
+    // can we cheat, and treat a thin stroke as a hairline w/ coverage
     // if we can, we draw lots faster (raster device does this same test)
-    {
-        SkAlpha newAlpha;
-        if (SkDrawTreatAsHairline(*paint, *draw.fMatrix, &newAlpha)) {
-            lazyPaint.set(*paint);
-            lazyPaint.get()->setAlpha(newAlpha);
-            lazyPaint.get()->setStrokeWidth(0);
-            paint = lazyPaint.get();
-            doFill = false;
-        }
+    if (SkDrawTreatAsHairline(paint, *draw.fMatrix, &coverage)) {
+        doFill = false;
     }
-    // must reference paint from here down, and not origPaint
-    // since we may have change the paint (using lazyPaint for storage)
-    
+
     GrPaint grPaint;
     SkAutoCachedTexture act;
-    if (!this->skPaint2GrPaintShader(*paint,
+    if (!this->skPaint2GrPaintShader(paint,
                                      &act,
                                      *draw.fMatrix,
                                      &grPaint,
@@ -1094,6 +1127,8 @@
         return;
     }
 
+    grPaint.fCoverage = SkScalarRoundToInt(coverage * grPaint.fCoverage);
+
     // If we have a prematrix, apply it to the path, optimizing for the case
     // where the original path can in fact be modified in place (even though
     // its parameter type is const).
@@ -1115,25 +1150,25 @@
     // at this point we're done with prePathMatrix
     SkDEBUGCODE(prePathMatrix = (const SkMatrix*)0x50FF8001;)
 
-    if (doFill && (paint->getPathEffect() || 
-                   paint->getStyle() != SkPaint::kFill_Style)) {
+    if (paint.getPathEffect() ||
+        (doFill && paint.getStyle() != SkPaint::kFill_Style)) {
         // it is safe to use tmpPath here, even if we already used it for the
         // prepathmatrix, since getFillPath can take the same object for its
         // input and output safely.
-        doFill = paint->getFillPath(*pathPtr, &tmpPath);
+        doFill = paint.getFillPath(*pathPtr, &tmpPath);
         pathPtr = &tmpPath;
     }
 
-    if (paint->getMaskFilter()) {
+    if (paint.getMaskFilter()) {
         // avoid possibly allocating a new path in transform if we can
         SkPath* devPathPtr = pathIsMutable ? pathPtr : &tmpPath;
 
         // transform the path into device space
         pathPtr->transform(*draw.fMatrix, devPathPtr);
-        if (!drawWithGPUMaskFilter(fContext, *devPathPtr, paint->getMaskFilter(),
+        if (!drawWithGPUMaskFilter(fContext, *devPathPtr, paint.getMaskFilter(),
                                    *draw.fMatrix, *draw.fClip, draw.fBounder,
                                    &grPaint)) {
-            drawWithMaskFilter(fContext, *devPathPtr, paint->getMaskFilter(),
+            drawWithMaskFilter(fContext, *devPathPtr, paint.getMaskFilter(),
                                *draw.fMatrix, *draw.fClip, draw.fBounder,
                                &grPaint);
         }
@@ -1173,7 +1208,7 @@
     return tilesX * tilesY;
 }
 
-inline int determine_tile_size(const SkBitmap& bitmap, 
+inline int determine_tile_size(const SkBitmap& bitmap,
                                const SkIRect* srcRectPtr,
                                int maxTextureSize) {
     static const int kSmallTileSize = 1 << 10;
@@ -1297,7 +1332,7 @@
         ScalarRect.set(srcRect);
 
         // Transform 'm' needs to be concatenated to the draw matrix,
-        // rather than transforming the primitive directly, so that 'm' will 
+        // rather than transforming the primitive directly, so that 'm' will
         // also affect the behavior of the mask filter.
         SkMatrix drawMatrix;
         drawMatrix.setConcat(*draw.fMatrix, m);
@@ -1424,7 +1459,7 @@
                       GrFixedToScalar((srcRect.fBottom << 16) / bitmap.height()));
 
     if (GrSamplerState::kNearest_Filter != sampler->getFilter() &&
-        (srcRect.width() < bitmap.width() || 
+        (srcRect.width() < bitmap.width() ||
          srcRect.height() < bitmap.height())) {
         // If drawing a subrect of the bitmap and filtering is enabled,
         // use a constrained texture domain to avoid color bleeding
@@ -1478,6 +1513,7 @@
 
     SkImageFilter* imageFilter = paint.getImageFilter();
     SkSize blurSize;
+    SkISize radius;
     if (NULL != imageFilter && imageFilter->asABlur(&blurSize)) {
         GrAutoScratchTexture temp1, temp2;
         GrTexture* blurTexture = gaussianBlur(fContext,
@@ -1487,6 +1523,32 @@
                                               blurSize.height());
         texture = blurTexture;
         grPaint.setTexture(kBitmapTextureIdx, texture);
+    } else if (NULL != imageFilter && imageFilter->asADilate(&radius)) {
+        const GrTextureDesc desc = {
+            kRenderTarget_GrTextureFlagBit,
+            w,
+            h,
+            kRGBA_8888_PM_GrPixelConfig,
+            {0} // samples
+        };
+        GrAutoScratchTexture temp1(fContext, desc), temp2(fContext, desc);
+        texture = applyMorphology(fContext, texture, GrRect::MakeWH(w, h),
+                                  temp1.texture(), temp2.texture(),
+                                  GrSamplerState::kDilate_Filter, radius);
+        grPaint.setTexture(kBitmapTextureIdx, texture);
+    } else if (NULL != imageFilter && imageFilter->asAnErode(&radius)) {
+        const GrTextureDesc desc = {
+            kRenderTarget_GrTextureFlagBit,
+            w,
+            h,
+            kRGBA_8888_PM_GrPixelConfig,
+            {0} // samples
+        };
+        GrAutoScratchTexture temp1(fContext, desc), temp2(fContext, desc);
+        texture = applyMorphology(fContext, texture, GrRect::MakeWH(w, h),
+                                  temp1.texture(), temp2.texture(),
+                                  GrSamplerState::kErode_Filter, radius);
+        grPaint.setTexture(kBitmapTextureIdx, texture);
     } else {
         grPaint.setTexture(kBitmapTextureIdx, texture);
     }
@@ -1538,7 +1600,8 @@
                               const SkMatrix& ctm,
                               SkBitmap* result, SkIPoint* offset) {
     SkSize size;
-    if (!filter->asABlur(&size)) {
+    SkISize radius;
+    if (!filter->asABlur(&size) && !filter->asADilate(&radius) && !filter->asAnErode(&radius)) {
         return false;
     }
     SkDevice* dev = this->createCompatibleDevice(SkBitmap::kARGB_8888_Config,
@@ -1580,7 +1643,7 @@
     if (NULL == texs) {
         if (!this->skPaint2GrPaintNoShader(paint,
                                            false,
-                                           &grPaint, 
+                                           &grPaint,
                                            NULL == colors)) {
             return;
         }
@@ -1766,7 +1829,7 @@
 }
 
 void SkGpuDevice::flush() {
-    fContext->flush(false);
+    fContext->resolveRenderTarget(fRenderTarget);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1780,10 +1843,10 @@
     if (kBitmap_TexType != type) {
         const GrTextureDesc desc = {
             kRenderTarget_GrTextureFlagBit,
-            kNone_GrAALevel,
             bitmap.width(),
             bitmap.height(),
-            SkGr::Bitmap2PixelConfig(bitmap)
+            SkGr::Bitmap2PixelConfig(bitmap),
+            {0} // samples
         };
         GrContext::ScratchTexMatch match;
         if (kSaveLayerDeviceRenderTarget_TexType == type) {
@@ -1800,7 +1863,7 @@
         if (!bitmap.isVolatile()) {
             GrContext::TextureKey key = bitmap.getGenerationID();
             key |= ((uint64_t) bitmap.pixelRefOffset()) << 32;
-        
+
             entry = ctx->findAndLockTexture(key, bitmap.width(),
                                             bitmap.height(), sampler);
             if (NULL == entry.texture()) {
@@ -1833,11 +1896,11 @@
 }
 
 
-SkDevice* SkGpuDevice::onCreateCompatibleDevice(SkBitmap::Config config, 
-                                                int width, int height, 
+SkDevice* SkGpuDevice::onCreateCompatibleDevice(SkBitmap::Config config,
+                                                int width, int height,
                                                 bool isOpaque,
                                                 Usage usage) {
-    return SkNEW_ARGS(SkGpuDevice,(this->context(), config, 
+    return SkNEW_ARGS(SkGpuDevice,(this->context(), config,
                                    width, height, usage));
 }
 
diff --git a/src/gpu/SkGr.cpp b/src/gpu/SkGr.cpp
index cab9d46..519475c 100644
--- a/src/gpu/SkGr.cpp
+++ b/src/gpu/SkGr.cpp
@@ -73,10 +73,10 @@
 
     GrTextureDesc desc = {
         kNone_GrTextureFlags,
-        kNone_GrAALevel,
         bitmap->width(),
         bitmap->height(),
-        SkGr::Bitmap2PixelConfig(*bitmap)
+        SkGr::Bitmap2PixelConfig(*bitmap),
+        {0} // samples
     };
 
     if (SkBitmap::kIndex8_Config == bitmap->config()) {
diff --git a/src/gpu/SkGrTexturePixelRef.cpp b/src/gpu/SkGrTexturePixelRef.cpp
index 045ddab..c81e9c0 100644
--- a/src/gpu/SkGrTexturePixelRef.cpp
+++ b/src/gpu/SkGrTexturePixelRef.cpp
@@ -16,7 +16,7 @@
 
 // since we call lockPixels recursively on fBitmap, we need a distinct mutex,
 // to avoid deadlock with the default one provided by SkPixelRef.
-static SkMutex  gROLockPixelsPixelRefMutex;
+SK_DECLARE_STATIC_MUTEX(gROLockPixelsPixelRefMutex);
 
 SkROLockPixelsPixelRef::SkROLockPixelsPixelRef() : INHERITED(&gROLockPixelsPixelRefMutex) {
 }
@@ -63,7 +63,7 @@
     desc.fHeight = texture->height();
     desc.fFlags = kRenderTarget_GrTextureFlagBit | kNoStencil_GrTextureFlagBit;
     desc.fConfig = SkGr::BitmapConfig2PixelConfig(dstConfig, false);
-    desc.fAALevel = kNone_GrAALevel;
+    desc.fSampleCnt = 0;
 
     GrTexture* dst = context->createUncachedTexture(desc, NULL, 0);
     if (NULL == dst) {
diff --git a/src/gpu/android/GrGLCreateNativeInterface_android.cpp b/src/gpu/android/GrGLCreateNativeInterface_android.cpp
index 5147627..db629f2 100644
--- a/src/gpu/android/GrGLCreateNativeInterface_android.cpp
+++ b/src/gpu/android/GrGLCreateNativeInterface_android.cpp
@@ -4,7 +4,7 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
-#include "GrGLInterface.h"
+#include "gl/GrGLInterface.h"
 
 #ifndef GL_GLEXT_PROTOTYPES
 #define GL_GLEXT_PROTOTYPES
diff --git a/src/gpu/android/SkNativeGLContext_android.cpp b/src/gpu/android/SkNativeGLContext_android.cpp
index eb58c27..dd444df 100644
--- a/src/gpu/android/SkNativeGLContext_android.cpp
+++ b/src/gpu/android/SkNativeGLContext_android.cpp
@@ -5,7 +5,7 @@
  * Use of this source code is governed by a BSD-style license that can be
  * found in the LICENSE file.
  */
-#include "SkNativeGLContext.h"
+#include "gl/SkNativeGLContext.h"
 
 SkNativeGLContext::AutoContextRestore::AutoContextRestore() {
     fOldEGLContext = eglGetCurrentContext();
diff --git a/src/gpu/app-android.cpp b/src/gpu/app-android.cpp
index ae8b7dd..2dad1f9 100644
--- a/src/gpu/app-android.cpp
+++ b/src/gpu/app-android.cpp
@@ -17,7 +17,7 @@
 #include "SkString.h"
 #include "SkTime.h"
 
-#include "GrGLConfig.h"
+#include "gl/GrGLConfig.h"
 
 static GrContext* make_context() {
     SkDebugf("---- before create\n");
diff --git a/src/gpu/gl/GrGLCaps.cpp b/src/gpu/gl/GrGLCaps.cpp
new file mode 100644
index 0000000..d690ff3
--- /dev/null
+++ b/src/gpu/gl/GrGLCaps.cpp
@@ -0,0 +1,295 @@
+
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+
+#include "GrGLCaps.h"
+#include "GrGLContextInfo.h"
+
+GrGLCaps::GrGLCaps() {
+    this->reset(); // a fresh GrGLCaps reports nothing as supported until init() is called
+}
+
+void GrGLCaps::reset() {
+    fVerifiedColorConfigs.reset();        // forget color configs verified so far
+    fStencilFormats.reset();              // drop the list of legal stencil formats
+    fStencilVerifiedColorConfigs.reset(); // and the per-stencil-format verification bits
+    fMSFBOType = kNone_MSFBOType;
+    fMaxFragmentUniformVectors = 0;
+    fRGBA8RenderbufferSupport = false;    // every feature flag below goes back to "unsupported"
+    fBGRAFormatSupport = false;
+    fBGRAIsInternalFormat = false;
+    fTextureSwizzleSupport = false;
+    fUnpackRowLengthSupport = false;
+    fUnpackFlipYSupport = false;
+    fPackRowLengthSupport = false;
+    fPackFlipYSupport = false;
+    fTextureUsageSupport = false;
+    fTexStorageSupport = false;
+}
+
+GrGLCaps::GrGLCaps(const GrGLCaps& caps) {
+    *this = caps; // copy construction is implemented in terms of operator=
+}
+
+GrGLCaps& GrGLCaps::operator = (const GrGLCaps& caps) {
+    fVerifiedColorConfigs = caps.fVerifiedColorConfigs; // member-wise copy of every field
+    fStencilFormats = caps.fStencilFormats;
+    fStencilVerifiedColorConfigs = caps.fStencilVerifiedColorConfigs;
+    fMaxFragmentUniformVectors = caps.fMaxFragmentUniformVectors;
+    fMSFBOType = caps.fMSFBOType;
+    fRGBA8RenderbufferSupport = caps.fRGBA8RenderbufferSupport;
+    fBGRAFormatSupport = caps.fBGRAFormatSupport;
+    fBGRAIsInternalFormat = caps.fBGRAIsInternalFormat;
+    fTextureSwizzleSupport = caps.fTextureSwizzleSupport;
+    fUnpackRowLengthSupport = caps.fUnpackRowLengthSupport;
+    fUnpackFlipYSupport = caps.fUnpackFlipYSupport;
+    fPackRowLengthSupport = caps.fPackRowLengthSupport;
+    fPackFlipYSupport = caps.fPackFlipYSupport;
+    fTextureUsageSupport = caps.fTextureUsageSupport;
+    fTexStorageSupport = caps.fTexStorageSupport;
+
+    return *this;
+}
+
+void GrGLCaps::init(const GrGLContextInfo& ctxInfo) {
+    // Start from "nothing supported"; an uninitialized ctxInfo leaves it that way.
+    this->reset();
+    if (!ctxInfo.isInitialized()) {
+        return;
+    }
+
+    const GrGLInterface* gli = ctxInfo.interface();
+    GrGLBinding binding = ctxInfo.binding();
+    GrGLVersion version = ctxInfo.version();
+
+    if (kES2_GrGLBinding == binding) {
+        GR_GL_GetIntegerv(gli, GR_GL_MAX_FRAGMENT_UNIFORM_VECTORS,
+                          &fMaxFragmentUniformVectors);
+    } else {
+        GrAssert(kDesktop_GrGLBinding == binding);
+        GrGLint max;
+        GR_GL_GetIntegerv(gli, GR_GL_MAX_FRAGMENT_UNIFORM_COMPONENTS, &max);
+        fMaxFragmentUniformVectors = max / 4; // desktop query is in components; store vectors
+    }
+
+    if (kDesktop_GrGLBinding == binding) {
+        fRGBA8RenderbufferSupport = true;
+    } else {
+        fRGBA8RenderbufferSupport = ctxInfo.hasExtension("GL_OES_rgb8_rgba8") ||
+                                    ctxInfo.hasExtension("GL_ARM_rgba8");
+    }
+
+    if (kDesktop_GrGLBinding == binding) {
+        fBGRAFormatSupport = version >= GR_GL_VER(1,2) ||
+                             ctxInfo.hasExtension("GL_EXT_bgra");
+    } else {
+        // Either ES extension enables BGRA; only the EXT one marks it as an internal format.
+        if (ctxInfo.hasExtension("GL_APPLE_texture_format_BGRA8888")) {
+            fBGRAFormatSupport = true;
+        } else if (ctxInfo.hasExtension("GL_EXT_texture_format_BGRA8888")) {
+            fBGRAFormatSupport = true;
+            fBGRAIsInternalFormat = true;
+        }
+        GrAssert(fBGRAFormatSupport ||
+                 kSkia8888_PM_GrPixelConfig != kBGRA_8888_PM_GrPixelConfig);
+    }
+
+    if (kDesktop_GrGLBinding == binding) {
+        fTextureSwizzleSupport = version >= GR_GL_VER(3,3) ||
+                                 ctxInfo.hasExtension("GL_ARB_texture_swizzle");
+    } else {
+        fTextureSwizzleSupport = false;
+    }
+
+    if (kDesktop_GrGLBinding == binding) {
+        fUnpackRowLengthSupport = true;
+        fUnpackFlipYSupport = false;
+        fPackRowLengthSupport = true;
+        fPackFlipYSupport = false;
+    } else {
+        fUnpackRowLengthSupport = ctxInfo.hasExtension("GL_EXT_unpack_subimage");
+        fUnpackFlipYSupport = ctxInfo.hasExtension("GL_CHROMIUM_flipy");
+        // no extension for pack row length
+        fPackRowLengthSupport = false;
+        fPackFlipYSupport =
+            ctxInfo.hasExtension("GL_ANGLE_pack_reverse_row_order");
+    }
+
+    fTextureUsageSupport = (kES2_GrGLBinding == binding) &&
+                            ctxInfo.hasExtension("GL_ANGLE_texture_usage");
+
+    // Tex storage is in desktop 4.2 and can be an extension to desktop or ES.
+    fTexStorageSupport = (kDesktop_GrGLBinding == binding &&
+                          version >= GR_GL_VER(4,2)) ||
+                         ctxInfo.hasExtension("GL_ARB_texture_storage") ||
+                         ctxInfo.hasExtension("GL_EXT_texture_storage");
+
+    this->initFSAASupport(ctxInfo);    // fills in fMSFBOType
+    this->initStencilFormats(ctxInfo); // fills in fStencilFormats
+}
+
+void GrGLCaps::initFSAASupport(const GrGLContextInfo& ctxInfo) {
+    // Assume no MSAA FBO support until a GL version or extension says otherwise.
+    fMSFBOType = kNone_MSFBOType;
+    if (kDesktop_GrGLBinding == ctxInfo.binding()) {
+        if ((ctxInfo.version() >= GR_GL_VER(3,0)) ||
+            ctxInfo.hasExtension("GL_ARB_framebuffer_object")) {
+            fMSFBOType = kDesktopARB_MSFBOType;
+        } else if (ctxInfo.hasExtension("GL_EXT_framebuffer_multisample") &&
+                   ctxInfo.hasExtension("GL_EXT_framebuffer_blit")) {
+            fMSFBOType = kDesktopEXT_MSFBOType;
+        }
+    } else {
+        if (ctxInfo.hasExtension("GL_CHROMIUM_framebuffer_multisample")) {
+            // chrome's extension is equivalent to the EXT msaa
+            // and fbo_blit extensions.
+            fMSFBOType = kDesktopEXT_MSFBOType;
+        } else if (ctxInfo.hasExtension("GL_APPLE_framebuffer_multisample")) {
+            fMSFBOType = kAppleES_MSFBOType;
+        }
+    }
+}
+
+namespace {
+const GrGLuint kUnknownBitCount = GrGLStencilBuffer::kUnknownBitCount; // file-local shorthand for the table in initStencilFormats()
+}
+
+void GrGLCaps::initStencilFormats(const GrGLContextInfo& ctxInfo) {
+
+    // Build up list of legal stencil formats (though perhaps not supported on
+    // the particular gpu/driver) from most preferred to least.
+
+    // these consts are in order of most preferred to least preferred
+    // we don't bother with GL_STENCIL_INDEX1 or GL_DEPTH32F_STENCIL8
+
+    static const StencilFormat
+                  // internal Format      stencil bits      total bits        packed?
+        gS8    = {GR_GL_STENCIL_INDEX8,   8,                8,                false},
+        gS16   = {GR_GL_STENCIL_INDEX16,  16,               16,               false},
+        gD24S8 = {GR_GL_DEPTH24_STENCIL8, 8,                32,               true },
+        gS4    = {GR_GL_STENCIL_INDEX4,   4,                4,                false},
+        gS     = {GR_GL_STENCIL_INDEX,    kUnknownBitCount, kUnknownBitCount, false}, // NOTE(review): never added to fStencilFormats below - confirm intent
+        gDS    = {GR_GL_DEPTH_STENCIL,    kUnknownBitCount, kUnknownBitCount, true };
+
+    if (kDesktop_GrGLBinding == ctxInfo.binding()) {
+        bool supportsPackedDS =
+            ctxInfo.version() >= GR_GL_VER(3,0) || 
+            ctxInfo.hasExtension("GL_EXT_packed_depth_stencil") ||
+            ctxInfo.hasExtension("GL_ARB_framebuffer_object");
+
+        // S1 thru S16 formats are in GL 3.0+, EXT_FBO, and ARB_FBO since we
+        // require FBO support we can expect these are legal formats and don't
+        // check. These also all support the unsized GL_STENCIL_INDEX.
+        fStencilFormats.push_back() = gS8;
+        fStencilFormats.push_back() = gS16;
+        if (supportsPackedDS) {
+            fStencilFormats.push_back() = gD24S8;
+        }
+        fStencilFormats.push_back() = gS4;
+        if (supportsPackedDS) {
+            fStencilFormats.push_back() = gDS;
+        }
+    } else {
+        // ES2 has STENCIL_INDEX8 without extensions but requires extensions
+        // for other formats.
+        // ES doesn't support using the unsized format.
+
+        fStencilFormats.push_back() = gS8;
+        //fStencilFormats.push_back() = gS16;
+        if (ctxInfo.hasExtension("GL_OES_packed_depth_stencil")) {
+            fStencilFormats.push_back() = gD24S8;
+        }
+        if (ctxInfo.hasExtension("GL_OES_stencil4")) {
+            fStencilFormats.push_back() = gS4;
+        }
+    }
+    GrAssert(0 == fStencilVerifiedColorConfigs.count());
+    fStencilVerifiedColorConfigs.push_back_n(fStencilFormats.count()); // one verification entry per stencil format, same index
+}
+
+void GrGLCaps::markColorConfigAndStencilFormatAsVerified(
+                                    GrPixelConfig config,
+                                    const GrGLStencilBuffer::Format& format) {
+#if !GR_GL_CHECK_FBO_STATUS_ONCE_PER_FORMAT
+    return; // once-per-format checking is compiled out: record nothing
+#endif
+    GrAssert((unsigned)config < kGrPixelConfigCount);
+    GrAssert(fStencilFormats.count() == fStencilVerifiedColorConfigs.count());
+    // Only a handful of stencil formats are ever registered, so a linear
+    // scan for the matching internal format is fine.
+    const int formatCnt = fStencilFormats.count();
+    GrAssert(formatCnt < 16);
+    for (int idx = 0; idx < formatCnt; ++idx) {
+        if (fStencilFormats[idx].fInternalFormat != format.fInternalFormat) {
+            continue;
+        }
+        fStencilVerifiedColorConfigs[idx].markVerified(config);
+        return;
+    }
+    GrCrash("Why are we seeing a stencil format that "
+            "GrGLCaps doesn't know about.");
+}
+
+bool GrGLCaps::isColorConfigAndStencilFormatVerified(
+                                GrPixelConfig config,
+                                const GrGLStencilBuffer::Format& format) const {
+#if !GR_GL_CHECK_FBO_STATUS_ONCE_PER_FORMAT
+    return false; // once-per-format checking is compiled out: never verified
+#endif
+    GrAssert((unsigned)config < kGrPixelConfigCount);
+    // The list of candidate stencil formats is tiny, so scan it linearly
+    // for the entry whose internal format matches.
+    const int formatCnt = fStencilFormats.count();
+    GrAssert(formatCnt < 16);
+    for (int idx = 0; idx < formatCnt; ++idx) {
+        if (fStencilFormats[idx].fInternalFormat == format.fInternalFormat) {
+            return fStencilVerifiedColorConfigs[idx].isVerified(config);
+        }
+    }
+    // No registered stencil format matched the one we were handed.
+    GrCrash("Why are we seeing a stencil format that "
+            "GLCaps doesn't know about.");
+    return false;
+}
+
+void GrGLCaps::print() const {
+    for (int i = 0; i < fStencilFormats.count(); ++i) { // one line per legal stencil format
+        GrPrintf("Stencil Format %d, stencil bits: %02d, total bits: %02d\n",
+                 i,
+                 fStencilFormats[i].fStencilBits,
+                 fStencilFormats[i].fTotalBits);
+    }
+
+    GR_STATIC_ASSERT(0 == kNone_MSFBOType); // the string table below is indexed by MSFBOType value
+    GR_STATIC_ASSERT(1 == kDesktopARB_MSFBOType);
+    GR_STATIC_ASSERT(2 == kDesktopEXT_MSFBOType);
+    GR_STATIC_ASSERT(3 == kAppleES_MSFBOType);
+    static const char* gMSFBOExtStr[] = {
+        "None",
+        "ARB",
+        "EXT",
+        "Apple",
+    };
+    GrPrintf("MSAA Type: %s\n", gMSFBOExtStr[fMSFBOType]);
+    GrPrintf("Max FS Uniform Vectors: %d\n", fMaxFragmentUniformVectors);
+    GrPrintf("Support RGBA8 Render Buffer: %s\n",
+             (fRGBA8RenderbufferSupport ? "YES": "NO"));
+    GrPrintf("BGRA is an internal format: %s\n",
+             (fBGRAIsInternalFormat ? "YES": "NO"));
+    GrPrintf("Support texture swizzle: %s\n",
+             (fTextureSwizzleSupport ? "YES": "NO"));
+    GrPrintf("Unpack Row length support: %s\n",
+             (fUnpackRowLengthSupport ? "YES": "NO"));
+    GrPrintf("Unpack Flip Y support: %s\n",
+             (fUnpackFlipYSupport ? "YES": "NO"));
+    GrPrintf("Pack Row length support: %s\n",
+             (fPackRowLengthSupport ? "YES": "NO"));
+    GrPrintf("Pack Flip Y support: %s\n",
+             (fPackFlipYSupport ? "YES": "NO"));
+}
+
diff --git a/src/gpu/gl/GrGLCaps.h b/src/gpu/gl/GrGLCaps.h
new file mode 100644
index 0000000..a5318eb
--- /dev/null
+++ b/src/gpu/gl/GrGLCaps.h
@@ -0,0 +1,227 @@
+
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+
+#ifndef GrGLCaps_DEFINED
+#define GrGLCaps_DEFINED
+
+#include "GrGLStencilBuffer.h"
+
+class GrGLContextInfo;
+
+/**
+ * Stores some capabilities of a GL context. Most are determined by the GL
+ * version and the extensions string. It also tracks formats that have passed
+ * the FBO completeness test.
+ */
+class GrGLCaps {
+public:
+    typedef GrGLStencilBuffer::Format StencilFormat;
+
+    /**
+     * The type of MSAA for FBOs supported. Different extensions have different
+     * semantics of how / when a resolve is performed.
+     */
+    enum MSFBOType {
+        /**
+         * no support for MSAA FBOs
+         */
+        kNone_MSFBOType = 0,
+        /**
+         * GL3.0-style MSAA FBO (GL_ARB_framebuffer_object)
+         */
+        kDesktopARB_MSFBOType,
+        /**
+         * earlier GL_EXT_framebuffer* extensions
+         */
+        kDesktopEXT_MSFBOType,
+        /**
+         * GL_APPLE_framebuffer_multisample ES extension
+         */
+        kAppleES_MSFBOType,
+    };
+
+    /**
+     * Creates a GrGLCaps that advertises no support for any extensions,
+     * formats, etc. Call init to initialize from a GrGLContextInfo.
+     */
+    GrGLCaps();
+
+    GrGLCaps(const GrGLCaps& caps);
+
+    GrGLCaps& operator = (const GrGLCaps& caps);
+
+    /**
+     * Resets the caps such that nothing is supported.
+     */
+    void reset();
+
+    /**
+     * Initializes the GrGLCaps to the set of features supported in the current
+     * OpenGL context accessible via ctxInfo.
+     */
+    void init(const GrGLContextInfo& ctxInfo);
+
+    /**
+     * Call to note that a color config has been verified as a valid color
+     * attachment. This may save future calls to glCheckFramebufferStatus
+     * using isConfigVerifiedColorAttachment().
+     */
+    void markConfigAsValidColorAttachment(GrPixelConfig config) {
+        fVerifiedColorConfigs.markVerified(config);
+    }
+
+    /**
+     * Call to check whether a config has been verified as a valid color
+     * attachment.
+     */
+    bool isConfigVerifiedColorAttachment(GrPixelConfig config) const {
+        return fVerifiedColorConfigs.isVerified(config);
+    }
+
+    /**
+     * Call to note that a color config / stencil format pair passed
+     * FBO status check. We may skip calling glCheckFramebufferStatus for
+     * this combination in the future using
+     * isColorConfigAndStencilFormatVerified().
+     */
+    void markColorConfigAndStencilFormatAsVerified(
+                    GrPixelConfig config,
+                    const GrGLStencilBuffer::Format& format);
+
+    /**
+     * Call to check whether color config / stencil format pair has already
+     * passed FBO status check.
+     */
+    bool isColorConfigAndStencilFormatVerified(
+                    GrPixelConfig config,
+                    const GrGLStencilBuffer::Format& format) const;
+
+    /**
+     * Reports the type of MSAA FBO support.
+     */
+    MSFBOType msFBOType() const { return fMSFBOType; }
+
+    /**
+     * Prints the caps info using GrPrintf.
+     */
+    void print() const;
+
+    /**
+     * Gets an array of legal stencil formats. These formats are not guaranteed
+     * to be supported by the driver but are legal GLenum names given the GL
+     * version and extensions supported.
+     */
+    const SkTArray<StencilFormat, true>& stencilFormats() const {
+        return fStencilFormats;
+    }
+
+    /// The maximum number of fragment uniform vectors (GLES has min. 16).
+    int maxFragmentUniformVectors() const { return fMaxFragmentUniformVectors; }
+
+    /// ES requires an extension to support RGBA8 in RenderBufferStorage
+    bool rgba8RenderbufferSupport() const { return fRGBA8RenderbufferSupport; }
+
+    /// Is GL_BGRA supported
+    bool bgraFormatSupport() const { return fBGRAFormatSupport; }
+
+    /**
+     * Depending on the ES extensions present the BGRA external format may
+     * correspond to either a BGRA or RGBA internalFormat. On desktop GL it is
+     * RGBA.
+     */
+    bool bgraIsInternalFormat() const { return fBGRAIsInternalFormat; }
+
+    /// GL_ARB_texture_swizzle support
+    bool textureSwizzleSupport() const { return fTextureSwizzleSupport; }
+
+    /// Is there support for GL_UNPACK_ROW_LENGTH
+    bool unpackRowLengthSupport() const { return fUnpackRowLengthSupport; }
+
+    /// Is there support for GL_UNPACK_FLIP_Y
+    bool unpackFlipYSupport() const { return fUnpackFlipYSupport; }
+
+    /// Is there support for GL_PACK_ROW_LENGTH
+    bool packRowLengthSupport() const { return fPackRowLengthSupport; }
+
+    /// Is there support for GL_PACK_REVERSE_ROW_ORDER
+    bool packFlipYSupport() const { return fPackFlipYSupport; }
+
+    /// Is there support for texture parameter GL_TEXTURE_USAGE
+    bool textureUsageSupport() const { return fTextureUsageSupport; }
+
+    /// Is there support for glTexStorage
+    bool texStorageSupport() const { return fTexStorageSupport; }
+
+private:
+    /**
+     * Maintains a bit per GrPixelConfig. It is used to avoid redundantly
+     * performing glCheckFrameBufferStatus for the same config.
+     */
+    struct VerifiedColorConfigs {
+        VerifiedColorConfigs() {
+            this->reset();
+        }
+
+        void reset() {
+            for (int i = 0; i < kNumUints; ++i) {
+                fVerifiedColorConfigs[i] = 0;
+            }
+        }
+
+        static const int kNumUints = (kGrPixelConfigCount  + 31) / 32;
+        uint32_t fVerifiedColorConfigs[kNumUints];
+
+        void markVerified(GrPixelConfig config) {
+#if !GR_GL_CHECK_FBO_STATUS_ONCE_PER_FORMAT
+            return;
+#endif
+            int u32Idx = config / 32;
+            int bitIdx = config % 32;
+            fVerifiedColorConfigs[u32Idx] |= 1u << bitIdx; // unsigned 1: bit 31 must not shift a signed int
+        }
+
+        bool isVerified(GrPixelConfig config) const {
+#if !GR_GL_CHECK_FBO_STATUS_ONCE_PER_FORMAT
+            return false;
+#endif
+            int u32Idx = config / 32;
+            int bitIdx = config % 32;
+            return SkToBool(fVerifiedColorConfigs[u32Idx] & (1u << bitIdx));
+        }
+    };
+
+    void initFSAASupport(const GrGLContextInfo& ctxInfo);
+    void initStencilFormats(const GrGLContextInfo& ctxInfo);
+
+    // tracks configs that have been verified to pass the FBO completeness when
+    // used as a color attachment
+    VerifiedColorConfigs fVerifiedColorConfigs;
+
+    SkTArray<StencilFormat, true> fStencilFormats;
+    // tracks configs that have been verified to pass the FBO completeness when
+    // used as a color attachment when a particular stencil format is used
+    // as a stencil attachment.
+    SkTArray<VerifiedColorConfigs, true> fStencilVerifiedColorConfigs;
+
+    int fMaxFragmentUniformVectors;
+    MSFBOType fMSFBOType;
+
+    bool fRGBA8RenderbufferSupport : 1;
+    bool fBGRAFormatSupport : 1;
+    bool fBGRAIsInternalFormat : 1;
+    bool fTextureSwizzleSupport : 1;
+    bool fUnpackRowLengthSupport : 1;
+    bool fUnpackFlipYSupport : 1;
+    bool fPackRowLengthSupport : 1;
+    bool fPackFlipYSupport : 1;
+    bool fTextureUsageSupport : 1;
+    bool fTexStorageSupport : 1;
+};
+
+#endif
diff --git a/src/gpu/gl/GrGLContextInfo.cpp b/src/gpu/gl/GrGLContextInfo.cpp
new file mode 100644
index 0000000..33e19ab
--- /dev/null
+++ b/src/gpu/gl/GrGLContextInfo.cpp
@@ -0,0 +1,85 @@
+
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "GrGLContextInfo.h"
+
+GrGLContextInfo::~GrGLContextInfo() {
+    GrSafeUnref(fInterface); // balances the ref() taken on the interface in initialize()
+}
+
+GrGLContextInfo::GrGLContextInfo() : fInterface(NULL) {
+    this->reset(); // reset() hands fInterface to GrSafeSetNull(), so it must not be garbage here
+}
+
+GrGLContextInfo::GrGLContextInfo(const GrGLInterface* interface) {
+    fInterface = NULL; // must be a defined value before initialize() calls reset()
+    this->initialize(interface); // result is dropped here; callers can check isInitialized()
+}
+
+GrGLContextInfo::GrGLContextInfo(const GrGLContextInfo& ctx) {
+    fInterface = NULL; // give operator= a defined "old" interface pointer to replace
+    *this = ctx;       // copy construction is implemented in terms of operator=
+}
+
+GrGLContextInfo& GrGLContextInfo::operator = (const GrGLContextInfo& ctx) {
+    GrSafeAssign(fInterface, ctx.fInterface); // NOTE(review): presumably unrefs the old and refs the new interface - confirm
+    fBindingInUse = ctx.fBindingInUse;
+    fGLVersion = ctx.fGLVersion;
+    fGLSLGeneration = ctx.fGLSLGeneration;
+    fExtensionString = ctx.fExtensionString;
+    fGLCaps = ctx.fGLCaps;
+    return *this;
+}
+
+void GrGLContextInfo::reset() {
+    GrSafeSetNull(fInterface); // NOTE(review): presumably unrefs a non-NULL interface, so fInterface must be initialized before the first call - confirm
+    fBindingInUse = kNone_GrGLBinding; // isInitialized() reports false while this is kNone
+    fGLVersion = GR_GL_VER(0, 0);
+    fGLSLGeneration = static_cast<GrGLSLGeneration>(0);
+    fExtensionString = "";
+    fGLCaps.reset();
+}
+
+bool GrGLContextInfo::initialize(const GrGLInterface* interface) {
+    this->reset();
+    // We haven't validated the GrGLInterface yet, so check for GetString
+    // function pointer (and for a NULL interface, which some factories return)
+    if (NULL != interface && NULL != interface->fGetString) {
+
+        const GrGLubyte* verUByte;
+        GR_GL_CALL_RET(interface, verUByte, GetString(GR_GL_VERSION));
+        const char* ver = reinterpret_cast<const char*>(verUByte);
+        GrGLBinding binding = GrGLGetBindingInUseFromString(ver);
+        // Validate against the binding just detected; fBindingInUse was cleared by reset().
+        if (interface->validate(binding)) {
+
+            fInterface = interface;
+            interface->ref();
+
+            fBindingInUse = binding;
+
+            fGLVersion = GrGLGetVersionFromString(ver);
+
+            fGLSLGeneration = GrGetGLSLGeneration(fBindingInUse,
+                                                  this->interface());
+
+            const GrGLubyte* ext;
+            GR_GL_CALL_RET(interface, ext, GetString(GR_GL_EXTENSIONS));
+            fExtensionString = reinterpret_cast<const char*>(ext);
+
+            fGLCaps.init(*this); // needs binding/version/extensions set above
+            return true;
+        }
+    }
+    return false; // left in the reset (uninitialized) state
+}
+
+bool GrGLContextInfo::isInitialized() const {
+    return kNone_GrGLBinding != fBindingInUse; // reset() stores kNone; a successful initialize() stores the detected binding
+}
+
diff --git a/src/gpu/gl/GrGLContextInfo.h b/src/gpu/gl/GrGLContextInfo.h
new file mode 100644
index 0000000..c37b11d
--- /dev/null
+++ b/src/gpu/gl/GrGLContextInfo.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+
+#ifndef GrGLContextInfo_DEFINED
+#define GrGLContextInfo_DEFINED
+
+#include "GrGLCaps.h"
+#include "gl/GrGLInterface.h"
+#include "GrGLSL.h"
+
+#include "SkString.h"
+
+/**
+ * Encapsulates information about an OpenGL context including the GrGLInterface
+ * used to make GL calls, the OpenGL version, the GrGLBinding type of the
+ * context, and GLSL version.
+ */
+class GrGLContextInfo {
+public:
+
+    /**
+     * Default constructor, creates an uninitialized GrGLContextInfo
+     */
+    GrGLContextInfo();
+
+    /**
+     * Creates a GrGLContextInfo from a GrGLInterface and the currently
+     * bound OpenGL context accessible by the GrGLInterface.
+     */
+    explicit GrGLContextInfo(const GrGLInterface* interface);
+
+    /**
+     * Copies a GrGLContextInfo
+     */
+    GrGLContextInfo(const GrGLContextInfo& ctx);
+
+    ~GrGLContextInfo();
+
+    /**
+     * Copies a GrGLContextInfo
+     */
+    GrGLContextInfo& operator = (const GrGLContextInfo& ctx);
+
+    /**
+     * Initializes a GrGLContextInfo from a GrGLInterface and the currently
+     * bound OpenGL context accessible by the GrGLInterface.
+     */
+    bool initialize(const GrGLInterface* interface);
+    bool isInitialized() const; // true once a binding other than kNone_GrGLBinding is recorded
+
+    const GrGLInterface* interface() const { return fInterface; }
+    GrGLBinding binding() const { return fBindingInUse; }
+    GrGLVersion version() const { return fGLVersion; }
+    GrGLSLGeneration glslGeneration() const { return fGLSLGeneration; }
+    const GrGLCaps& caps() const { return fGLCaps; }
+    GrGLCaps& caps() { return fGLCaps; }
+
+    /**
+     * Checks for extension support using a cached copy of the GL_EXTENSIONS
+     * string.
+     */
+    bool hasExtension(const char* ext) const {
+        if (!this->isInitialized()) {
+            return false;
+        }
+        return GrGLHasExtensionFromString(ext, fExtensionString.c_str());
+    }
+
+private:
+    void reset(); // returns every member to the uninitialized state
+
+    const GrGLInterface* fInterface;       // ref'ed in initialize(), unref'ed in the destructor
+    GrGLBinding          fBindingInUse;    // kNone_GrGLBinding while uninitialized
+    GrGLVersion          fGLVersion;
+    GrGLSLGeneration     fGLSLGeneration;
+    SkString             fExtensionString; // cached GL_EXTENSIONS string used by hasExtension()
+    GrGLCaps             fGLCaps;
+};
+
+#endif
diff --git a/src/gpu/GrGLCreateNativeInterface_none.cpp b/src/gpu/gl/GrGLCreateNativeInterface_none.cpp
similarity index 87%
rename from src/gpu/GrGLCreateNativeInterface_none.cpp
rename to src/gpu/gl/GrGLCreateNativeInterface_none.cpp
index 7de5912..914ed51 100644
--- a/src/gpu/GrGLCreateNativeInterface_none.cpp
+++ b/src/gpu/gl/GrGLCreateNativeInterface_none.cpp
@@ -6,7 +6,7 @@
  * found in the LICENSE file.
  */
 
-#include "GrGLInterface.h"
+#include "gl/GrGLInterface.h"
 
 const GrGLInterface* GrGLCreateNativeInterface() {
     return NULL;
diff --git a/src/gpu/GrGLCreateNullInterface.cpp b/src/gpu/gl/GrGLCreateNullInterface.cpp
similarity index 99%
rename from src/gpu/GrGLCreateNullInterface.cpp
rename to src/gpu/gl/GrGLCreateNullInterface.cpp
index d4a88c1..5095079 100644
--- a/src/gpu/GrGLCreateNullInterface.cpp
+++ b/src/gpu/gl/GrGLCreateNullInterface.cpp
@@ -7,7 +7,8 @@
  */
 
 
-#include "GrGLInterface.h"
+#include "gl/GrGLInterface.h"
+#include "../GrTDArray.h"
 
 GrGLvoid GR_GL_FUNCTION_TYPE nullGLActiveTexture(GrGLenum texture) {}
 GrGLvoid GR_GL_FUNCTION_TYPE nullGLAttachShader(GrGLuint program, GrGLuint shader) {}
@@ -124,7 +125,6 @@
 
 // In debug builds we do asserts that ensure we agree with GL about when a buffer
 // is mapped.
-#include "GrTDArray.h"
 static GrTDArray<GrGLuint> gMappedBuffers;
 static GrGLuint gCurrArrayBuffer;
 static GrGLuint gCurrElementArrayBuffer;
diff --git a/src/gpu/GrGLDefaultInterface_native.cpp b/src/gpu/gl/GrGLDefaultInterface_native.cpp
similarity index 88%
rename from src/gpu/GrGLDefaultInterface_native.cpp
rename to src/gpu/gl/GrGLDefaultInterface_native.cpp
index 7b8b84a..13988c0 100644
--- a/src/gpu/GrGLDefaultInterface_native.cpp
+++ b/src/gpu/gl/GrGLDefaultInterface_native.cpp
@@ -6,7 +6,7 @@
  * found in the LICENSE file.
  */
 
-#include "GrGLInterface.h"
+#include "gl/GrGLInterface.h"
 
 const GrGLInterface* GrGLDefaultInterface() {
     return GrGLCreateNativeInterface();
diff --git a/src/gpu/GrGLDefaultInterface_none.cpp b/src/gpu/gl/GrGLDefaultInterface_none.cpp
similarity index 87%
rename from src/gpu/GrGLDefaultInterface_none.cpp
rename to src/gpu/gl/GrGLDefaultInterface_none.cpp
index 2cca135..183c477 100644
--- a/src/gpu/GrGLDefaultInterface_none.cpp
+++ b/src/gpu/gl/GrGLDefaultInterface_none.cpp
@@ -6,7 +6,7 @@
  * found in the LICENSE file.
  */
 
-#include "GrGLInterface.h"
+#include "gl/GrGLInterface.h"
 
 const GrGLInterface* GrGLDefaultInterface() {
     return NULL;
diff --git a/src/gpu/GrGLIRect.h b/src/gpu/gl/GrGLIRect.h
similarity index 98%
rename from src/gpu/GrGLIRect.h
rename to src/gpu/gl/GrGLIRect.h
index e94fa21..aee5fb4 100644
--- a/src/gpu/GrGLIRect.h
+++ b/src/gpu/gl/GrGLIRect.h
@@ -11,7 +11,7 @@
 #ifndef GrGLIRect_DEFINED
 #define GrGLIRect_DEFINED
 
-#include "GrGLInterface.h"
+#include "gl/GrGLInterface.h"
 
 /**
  * Helper struct for dealing with the fact that Ganesh and GL use different
diff --git a/src/gpu/GrGLIndexBuffer.cpp b/src/gpu/gl/GrGLIndexBuffer.cpp
similarity index 89%
rename from src/gpu/GrGLIndexBuffer.cpp
rename to src/gpu/gl/GrGLIndexBuffer.cpp
index b64668e..60bd9a9 100644
--- a/src/gpu/GrGLIndexBuffer.cpp
+++ b/src/gpu/gl/GrGLIndexBuffer.cpp
@@ -104,27 +104,29 @@
     }
     this->bind();
     GrGLenum usage = dynamic() ? GR_GL_DYNAMIC_DRAW : GR_GL_STATIC_DRAW;
-#if !GR_GL_USE_BUFFER_DATA_NULL_HINT
-    // Note that we're cheating on the size here. Currently no methods
-    // allow a partial update that preserves contents of non-updated
-    // portions of the buffer (and lock() does a glBufferData(..size, NULL..))
-    GL_CALL(BufferData(GR_GL_ELEMENT_ARRAY_BUFFER, srcSizeInBytes, src, usage));
-#else
+
+#if GR_GL_USE_BUFFER_DATA_NULL_HINT
     if (this->sizeInBytes() == srcSizeInBytes) {
         GL_CALL(BufferData(GR_GL_ELEMENT_ARRAY_BUFFER,
                             srcSizeInBytes, src, usage));
     } else {
         // Before we call glBufferSubData we give the driver a hint using
         // glBufferData with NULL. This makes the old buffer contents
-        // inaccessible to future draws. The GPU may still be processing draws
-        // that reference the old contents. With this hint it can assign a
-        // different allocation for the new contents to avoid flushing the gpu
-        // past draws consuming the old contents.
+        // inaccessible to future draws. The GPU may still be processing
+        // draws that reference the old contents. With this hint it can
+        // assign a different allocation for the new contents to avoid
+        // flushing the gpu past draws consuming the old contents.
         GL_CALL(BufferData(GR_GL_ELEMENT_ARRAY_BUFFER,
                            this->sizeInBytes(), NULL, usage));
         GL_CALL(BufferSubData(GR_GL_ELEMENT_ARRAY_BUFFER,
                               0, srcSizeInBytes, src));
     }
+#else
+    // Note that we're cheating on the size here. Currently no methods
+    // allow a partial update that preserves contents of non-updated
+    // portions of the buffer (lock() does a glBufferData(..size, NULL..))
+    GL_CALL(BufferData(GR_GL_ELEMENT_ARRAY_BUFFER, 
+                       srcSizeInBytes, src, usage));
 #endif
     return true;
 }
diff --git a/src/gpu/GrGLIndexBuffer.h b/src/gpu/gl/GrGLIndexBuffer.h
similarity index 94%
rename from src/gpu/GrGLIndexBuffer.h
rename to src/gpu/gl/GrGLIndexBuffer.h
index c3e2287..68c165f 100644
--- a/src/gpu/GrGLIndexBuffer.h
+++ b/src/gpu/gl/GrGLIndexBuffer.h
@@ -11,8 +11,8 @@
 #ifndef GrGLIndexBuffer_DEFINED
 #define GrGLIndexBuffer_DEFINED
 
-#include "GrIndexBuffer.h"
-#include "GrGLInterface.h"
+#include "../GrIndexBuffer.h"
+#include "gl/GrGLInterface.h"
 
 class GrGpuGL;
 
diff --git a/src/gpu/GrGLInterface.cpp b/src/gpu/gl/GrGLInterface.cpp
similarity index 89%
rename from src/gpu/GrGLInterface.cpp
rename to src/gpu/gl/GrGLInterface.cpp
index 3ce9ab0..6475306 100644
--- a/src/gpu/GrGLInterface.cpp
+++ b/src/gpu/gl/GrGLInterface.cpp
@@ -8,8 +8,8 @@
 
 
 #include "GrTypes.h"
-#include "GrGLInterface.h"
-#include "GrGLDefines.h"
+#include "gl/GrGLInterface.h"
+#include "gl/GrGLDefines.h"
 
 #include <stdio.h>
 
@@ -19,6 +19,37 @@
 }
 #endif
 
+GrGLBinding GrGLGetBindingInUseFromString(const char* versionString) {
+    if (NULL == versionString) {
+        GrAssert(!"NULL GL version string.");
+        return kNone_GrGLBinding;
+    }
+
+    int major, minor;  // parsed only to test the format; values are not read
+
+    // check for desktop: the string starts directly with "<major>.<minor>"
+    int n = sscanf(versionString, "%d.%d", &major, &minor);
+    if (2 == n) {
+        return kDesktop_GrGLBinding;
+    }
+
+    // check for ES 1: "OpenGL ES-" followed by a two-character profile tag
+    char profile[2];
+    n = sscanf(versionString, "OpenGL ES-%c%c %d.%d", profile, profile+1,
+               &major, &minor);
+    if (4 == n) {
+        // we no longer support ES1.
+        return kNone_GrGLBinding;
+    }
+
+    // check for ES2: "OpenGL ES <major>.<minor>"
+    n = sscanf(versionString, "OpenGL ES %d.%d", &major, &minor);
+    if (2 == n) {
+        return kES2_GrGLBinding;
+    }
+    return kNone_GrGLBinding;  // unrecognized version string format
+}
+
 GrGLVersion GrGLGetVersionFromString(const char* versionString) {
     if (NULL == versionString) {
         GrAssert(!"NULL GL version string.");
@@ -65,10 +96,12 @@
         return GR_GLSL_VER(major, minor);
     }
 
+    // android hack
     n = sscanf(versionString, "OpenGL ES GLSL %d.%d", &major, &minor);
     if (2 == n) {
         return GR_GLSL_VER(major, minor);
     }
+
     return 0;
 }
 
@@ -95,6 +128,12 @@
     return GrGLHasExtensionFromString(ext, (const char*) glstr);
 }
 
+GrGLBinding GrGLGetBindingInUse(const GrGLInterface* gl) {
+    const GrGLubyte* v;  // result of GetString(GR_GL_VERSION) on this interface
+    GR_GL_CALL_RET(gl, v, GetString(GR_GL_VERSION));
+    return GrGLGetBindingInUseFromString((const char*) v);  // classify from the string
+}
+
 GrGLVersion GrGLGetVersion(const GrGLInterface* gl) {
     const GrGLubyte* v;
     GR_GL_CALL_RET(gl, v, GetString(GR_GL_VERSION));
@@ -108,7 +147,7 @@
 }
 
 GrGLInterface::GrGLInterface() {
-    fBindingsExported = (GrGLBinding)0;
+    fBindingsExported = kNone_GrGLBinding;
 
     fActiveTexture = NULL;
     fAttachShader = NULL;
@@ -233,14 +272,13 @@
 #endif
 }
 
-bool GrGLInterface::validate() const {
+bool GrGLInterface::validate(GrGLBinding binding) const {
 
-    bool isDesktop = this->supportsDesktop();
+    // kNone must be 0 so that the check we're about to do can never succeed if
+    // binding == kNone.
+    GR_STATIC_ASSERT(kNone_GrGLBinding == 0);
 
-    bool isES2 = this->supportsES2();
-    
-    if (isDesktop == isES2) {
-        // must have one, don't support both in same interface
+    if (0 == (binding & fBindingsExported)) {
         return false;
     }
 
@@ -346,14 +384,14 @@
     // these functions are part of ES2, we assume they are available
     // On the desktop we assume they are available if the extension
     // is present or GL version is high enough.
-    if ((kES2_GrGLBinding & fBindingsExported)) {
+    if (kES2_GrGLBinding == binding) {
         if (NULL == fBlendColor ||
             NULL == fStencilFuncSeparate ||
             NULL == fStencilMaskSeparate ||
             NULL == fStencilOpSeparate) {
             return false;
         }
-    } else if (kDesktop_GrGLBinding == fBindingsExported) {
+    } else if (kDesktop_GrGLBinding == binding) {
         if (glVer >= GR_GL_VER(2,0)) {
             if (NULL == fStencilFuncSeparate ||
                 NULL == fStencilMaskSeparate ||
@@ -405,7 +443,7 @@
     }
 
     // optional function on desktop before 1.3
-    if (kDesktop_GrGLBinding != fBindingsExported ||
+    if (kDesktop_GrGLBinding != binding ||
         (glVer >= GR_GL_VER(1,3) ||
         GrGLHasExtensionFromString("GL_ARB_texture_compression", ext))) {
         if (NULL == fCompressedTexImage2D) {
@@ -414,7 +452,7 @@
     }
 
     // part of desktop GL, but not ES
-    if (kDesktop_GrGLBinding == fBindingsExported &&
+    if (kDesktop_GrGLBinding == binding &&
         (NULL == fLineWidth ||
          NULL == fGetTexLevelParameteriv ||
          NULL == fDrawBuffer ||
@@ -424,7 +462,7 @@
 
     // GL_EXT_texture_storage is part of desktop 4.2
     // There is a desktop ARB extension and an ES+desktop EXT extension
-    if (kDesktop_GrGLBinding == fBindingsExported) {
+    if (kDesktop_GrGLBinding == binding) {
         if (glVer >= GR_GL_VER(4,2) ||
             GrGLHasExtensionFromString("GL_ARB_texture_storage", ext) ||
             GrGLHasExtensionFromString("GL_EXT_texture_storage", ext)) {
@@ -439,7 +477,7 @@
     }
 
     // FBO MSAA
-    if (kDesktop_GrGLBinding == fBindingsExported) {
+    if (kDesktop_GrGLBinding == binding) {
         // GL 3.0 and the ARB extension have multisample + blit
         if (glVer >= GR_GL_VER(3,0) || GrGLHasExtensionFromString("GL_ARB_framebuffer_object", ext)) {
             if (NULL == fRenderbufferStorageMultisample ||
@@ -474,7 +512,7 @@
     // On ES buffer mapping is an extension. On Desktop
     // buffer mapping was part of original VBO extension
     // which we require.
-    if (kDesktop_GrGLBinding == fBindingsExported  || 
+    if (kDesktop_GrGLBinding == binding || 
         GrGLHasExtensionFromString("GL_OES_mapbuffer", ext)) {
         if (NULL == fMapBuffer ||
             NULL == fUnmapBuffer) {
@@ -483,7 +521,7 @@
     }
 
     // Dual source blending
-    if (kDesktop_GrGLBinding == fBindingsExported  &&
+    if (kDesktop_GrGLBinding == binding &&
         (glVer >= GR_GL_VER(3,3) || 
          GrGLHasExtensionFromString("GL_ARB_blend_func_extended", ext))) {
         if (NULL == fBindFragDataLocationIndexed) {
diff --git a/src/gpu/GrGLProgram.cpp b/src/gpu/gl/GrGLProgram.cpp
similarity index 76%
rename from src/gpu/GrGLProgram.cpp
rename to src/gpu/gl/GrGLProgram.cpp
index 2e391e3..2925213 100644
--- a/src/gpu/GrGLProgram.cpp
+++ b/src/gpu/gl/GrGLProgram.cpp
@@ -9,7 +9,7 @@
 
 #include "GrGLProgram.h"
 
-#include "GrAllocator.h"
+#include "../GrAllocator.h"
 #include "GrGLShaderVar.h"
 #include "SkTrace.h"
 #include "SkXfermode.h"
@@ -22,23 +22,6 @@
     kUseUniform = 2000
 };
 
-
-const char* GrPrecision(const GrGLInterface* gl) {
-    if (gl->supportsES2()) {
-        return "mediump";
-    } else {
-        return " ";
-    }
-}
-
-const char* GrShaderPrecision(const GrGLInterface* gl) {
-    if (gl->supportsES2()) {
-        return "precision mediump float;\n";
-    } else {
-        return "";
-    }
-}
-
 }  // namespace
 
 #define PRINT_SHADERS 0
@@ -92,6 +75,7 @@
 #define COV_ATTR_NAME "aCoverage"
 #define EDGE_ATTR_NAME "aEdge"
 #define COL_UNI_NAME "uColor"
+#define COV_UNI_NAME "uCoverage"
 #define EDGES_UNI_NAME "uEdges"
 #define COL_FILTER_UNI_NAME "uColorFilter"
 #define COL_MATRIX_UNI_NAME "uColorMatrix"
@@ -182,6 +166,11 @@
     i->appendS32(stage);
 }
 
+inline void image_increment_param_name(int stage, GrStringBuilder* i) {
+    *i = "uImageIncrement";  // base name; overwrites whatever *i held
+    i->appendS32(stage);     // suffix with the stage index, e.g. "uImageIncrement0"
+}
+
 inline void tex_domain_name(int stage, GrStringBuilder* s) {
     *s = "uTexDom";
     s->appendS32(stage);
@@ -379,29 +368,6 @@
 
 namespace {
 
-const char* glsl_version_string(const GrGLInterface* gl,
-                                GrGLSLGeneration v) {
-    switch (v) {
-        case k110_GLSLGeneration:
-            if (gl->supportsES2()) {
-                // ES2s shader language is based on version 1.20 but is version
-                // 1.00 of the ES language.
-                return "#version 100\n";
-            } else {
-                return "#version 110\n";
-            }
-        case k130_GLSLGeneration:
-            GrAssert(!gl->supportsES2());
-            return "#version 130\n";
-        case k150_GLSLGeneration:
-            GrAssert(!gl->supportsES2());
-            return "#version 150\n";
-        default:
-            GrCrash("Unknown GL version.");
-            return ""; // suppress warning
-    }
-}
-
 // Adds a var that is computed in the VS and read in FS.
 // If there is a GS it will just pass it through.
 void append_varying(GrGLShaderVar::Type type,
@@ -462,7 +428,7 @@
 }
 }
 
-void GrGLProgram::genEdgeCoverage(const GrGLInterface* gl,
+void GrGLProgram::genEdgeCoverage(const GrGLContextInfo& gl,
                                   GrVertexLayout layout,
                                   CachedData* programData,
                                   GrStringBuilder* coverageVar,
@@ -526,21 +492,39 @@
         segments->fVSCode.appendf("\t%s = " EDGE_ATTR_NAME ";\n", vsName);
         if (GrDrawState::kHairLine_EdgeType == fProgramDesc.fVertexEdgeType) {
             segments->fFSCode.appendf("\tfloat edgeAlpha = abs(dot(vec3(gl_FragCoord.xy,1), %s.xyz));\n", fsName);
-        } else {
-            GrAssert(GrDrawState::kHairQuad_EdgeType == fProgramDesc.fVertexEdgeType);
-            // for now we know we're not in perspective, so we could compute this
-            // per-quadratic rather than per pixel
+            segments->fFSCode.append("\tedgeAlpha = max(1.0 - edgeAlpha, 0.0);\n");
+        } else if (GrDrawState::kQuad_EdgeType == fProgramDesc.fVertexEdgeType) {
+            segments->fFSCode.append("\tfloat edgeAlpha;\n");
+            // keep the derivative instructions outside the conditional 
             segments->fFSCode.appendf("\tvec2 duvdx = dFdx(%s.xy);\n", fsName);
             segments->fFSCode.appendf("\tvec2 duvdy = dFdy(%s.xy);\n", fsName);
-            segments->fFSCode.appendf("\tfloat dfdx = 2.0*%s.x*duvdx.x - duvdx.y;\n", fsName);
-            segments->fFSCode.appendf("\tfloat dfdy = 2.0*%s.x*duvdy.x - duvdy.y;\n", fsName);
+            segments->fFSCode.appendf("\tif (%s.z > 0.0 && %s.w > 0.0) {\n", fsName, fsName);
+            // today we know z and w are in device space. We could use derivatives
+            segments->fFSCode.appendf("\t\tedgeAlpha = min(min(%s.z, %s.w) + 0.5, 1.0);\n", fsName, fsName);
+            segments->fFSCode.append ("\t} else {\n");
+            segments->fFSCode.appendf("\t\tvec2 gF = vec2(2.0*%s.x*duvdx.x - duvdx.y,\n"
+                                      "\t\t               2.0*%s.x*duvdy.x - duvdy.y);\n",
+                                      fsName, fsName);
+            segments->fFSCode.appendf("\t\tedgeAlpha = (%s.x*%s.x - %s.y);\n", fsName, fsName, fsName);
+            segments->fFSCode.append("\t\tedgeAlpha = clamp(0.5 - edgeAlpha / length(gF), 0.0, 1.0);\n"
+                                      "\t}\n");
+            if (kES2_GrGLBinding == gl.binding()) {
+                segments->fHeader.printf("#extension GL_OES_standard_derivatives: enable\n");
+            }
+        } else {
+            GrAssert(GrDrawState::kHairQuad_EdgeType == fProgramDesc.fVertexEdgeType);
+            segments->fFSCode.appendf("\tvec2 duvdx = dFdx(%s.xy);\n", fsName);
+            segments->fFSCode.appendf("\tvec2 duvdy = dFdy(%s.xy);\n", fsName);
+            segments->fFSCode.appendf("\tvec2 gF = vec2(2.0*%s.x*duvdx.x - duvdx.y,\n"
+                                      "\t               2.0*%s.x*duvdy.x - duvdy.y);\n",
+                                      fsName, fsName);
             segments->fFSCode.appendf("\tfloat edgeAlpha = (%s.x*%s.x - %s.y);\n", fsName, fsName, fsName);
-            segments->fFSCode.append("\tedgeAlpha = sqrt(edgeAlpha*edgeAlpha / (dfdx*dfdx + dfdy*dfdy));\n");
-            if (gl->supportsES2()) {
+            segments->fFSCode.append("\tedgeAlpha = sqrt(edgeAlpha*edgeAlpha / dot(gF, gF));\n");
+            segments->fFSCode.append("\tedgeAlpha = max(1.0 - edgeAlpha, 0.0);\n");
+            if (kES2_GrGLBinding == gl.binding()) {
                 segments->fHeader.printf("#extension GL_OES_standard_derivatives: enable\n");
             }
         }
-        segments->fFSCode.append("\tedgeAlpha = max(1.0 - edgeAlpha, 0.0);\n");
         *coverageVar = "edgeAlpha";
     } else {
         coverageVar->reset();
@@ -549,29 +533,6 @@
 
 namespace {
 
-// returns true if the color output was explicitly declared or not.
-bool decl_and_get_fs_color_output(GrGLSLGeneration v,
-                                  VarArray* fsOutputs,
-                                  const char** name) {
-    switch (v) {
-        case k110_GLSLGeneration:
-            *name = "gl_FragColor";
-            return false;
-            break;
-        case k130_GLSLGeneration: // fallthru
-        case k150_GLSLGeneration:
-            *name = declared_color_output_name();
-            fsOutputs->push_back().set(GrGLShaderVar::kVec4f_Type,
-                                       GrGLShaderVar::kOut_TypeModifier,
-                                       declared_color_output_name());
-            return true;
-            break;
-        default:
-            GrCrash("Unknown GLSL version.");
-            return false; // suppress warning
-    }
-}
-
 void genInputColor(GrGLProgram::ProgramDesc::ColorInput colorInput,
                    GrGLProgram::CachedData* programData,
                    ShaderCodeSegments* segments,
@@ -604,32 +565,47 @@
     }
 }
 
-void genPerVertexCoverage(ShaderCodeSegments* segments,
-                          GrStringBuilder* inCoverage) {
-    segments->fVSAttrs.push_back().set(GrGLShaderVar::kFloat_Type,
+void genAttributeCoverage(ShaderCodeSegments* segments,
+                          GrStringBuilder* inOutCoverage) {
+    segments->fVSAttrs.push_back().set(GrGLShaderVar::kVec4f_Type,
                                        GrGLShaderVar::kAttribute_TypeModifier,
                                        COV_ATTR_NAME);
     const char *vsName, *fsName;
-    append_varying(GrGLShaderVar::kFloat_Type, "Coverage", 
+    append_varying(GrGLShaderVar::kVec4f_Type, "Coverage", 
                    segments, &vsName, &fsName);
     segments->fVSCode.appendf("\t%s = " COV_ATTR_NAME ";\n", vsName);
-    if (inCoverage->size()) {
-        segments->fFSCode.appendf("\tfloat edgeAndAttrCov = %s * %s;\n",
-                                  fsName, inCoverage->c_str());
-        *inCoverage = "edgeAndAttrCov";
+    if (inOutCoverage->size()) {
+        segments->fFSCode.appendf("\tvec4 attrCoverage = %s * %s;\n",
+                                  fsName, inOutCoverage->c_str());
+        *inOutCoverage = "attrCoverage";
     } else {
-        *inCoverage = fsName;
+        *inOutCoverage = fsName;
+    }
+}
+    
+void genUniformCoverage(ShaderCodeSegments* segments,
+                        GrGLProgram::CachedData* programData,
+                        GrStringBuilder* inOutCoverage) {  // in: prior coverage expr (may be empty); out: new expr
+    segments->fFSUnis.push_back().set(GrGLShaderVar::kVec4f_Type,
+                                      GrGLShaderVar::kUniform_TypeModifier,
+                                      COV_UNI_NAME);
+    programData->fUniLocations.fCoverageUni = kUseUniform;  // flag the coverage uniform as in use
+    if (inOutCoverage->size()) {  // prior coverage exists: emit FS code multiplying it by the uniform
+        segments->fFSCode.appendf("\tvec4 uniCoverage = %s * %s;\n",
+                                  COV_UNI_NAME, inOutCoverage->c_str());
+        *inOutCoverage = "uniCoverage";
+    } else {  // no prior coverage: the uniform itself is the coverage expression
+        *inOutCoverage = COV_UNI_NAME;
     }
 }
 
 }
 
-void GrGLProgram::genGeometryShader(const GrGLInterface* gl,
-                                    GrGLSLGeneration glslGeneration,
+void GrGLProgram::genGeometryShader(const GrGLContextInfo& gl,
                                     ShaderCodeSegments* segments) const {
 #if GR_GL_EXPERIMENTAL_GS
     if (fProgramDesc.fExperimentalGS) {
-        GrAssert(glslGeneration >= k150_GLSLGeneration);
+        GrAssert(gl.glslGeneration() >= k150_GrGLSLGeneration);
         segments->fGSHeader.append("layout(triangles) in;\n"
                                    "layout(triangle_strip, max_vertices = 6) out;\n");
         segments->fGSCode.append("void main() {\n"
@@ -654,7 +630,6 @@
 }
 
 const char* GrGLProgram::adjustInColor(const GrStringBuilder& inColor) const {
-    const char* color;
     if (inColor.size()) {
           return inColor.c_str();
     } else {
@@ -666,8 +641,8 @@
     }
 }
 
-bool GrGLProgram::genProgram(const GrGLInterface* gl,
-                             GrGLSLGeneration glslGeneration,
+
+bool GrGLProgram::genProgram(const GrGLContextInfo& gl,
                              GrGLProgram::CachedData* programData) const {
 
     ShaderCodeSegments segments;
@@ -680,6 +655,7 @@
 #endif
 
     SkXfermode::Coeff colorCoeff, uniformCoeff;
+    bool applyColorMatrix = SkToBool(fProgramDesc.fColorMatrixEnabled);
     // The rest of transfer mode color filters have not been implemented
     if (fProgramDesc.fColorFilterXfermode < SkXfermode::kCoeffModesCnt) {
         GR_DEBUGCODE(bool success =)
@@ -692,6 +668,15 @@
         uniformCoeff = SkXfermode::kZero_Coeff;
     }
 
+    // no need to do the color filter / matrix at all if coverage is 0. The
+    // output color is scaled by the coverage. All the dual source outputs are
+    // scaled by the coverage as well.
+    if (ProgramDesc::kTransBlack_ColorInput == fProgramDesc.fCoverageInput) {
+        colorCoeff = SkXfermode::kZero_Coeff;
+        uniformCoeff = SkXfermode::kZero_Coeff;
+        applyColorMatrix = false;
+    }
+
     // If we know the final color is going to be all zeros then we can
     // simplify the color filter coeffecients. needComputedColor will then
     // come out false below.
@@ -713,12 +698,17 @@
 
     // the dual source output has no canonical var name, have to
     // declare an output, which is incompatible with gl_FragColor/gl_FragData.
-    const char* fsColorOutput = NULL;
     bool dualSourceOutputWritten = false;
-    segments.fHeader.printf(glsl_version_string(gl, glslGeneration));
-    bool isColorDeclared = decl_and_get_fs_color_output(glslGeneration,
-                                                        &segments.fFSOutputs,
-                                                        &fsColorOutput);
+    segments.fHeader.printf(GrGetGLSLVersionDecl(gl.binding(),
+                                                 gl.glslGeneration()));
+
+    GrGLShaderVar colorOutput;
+    bool isColorDeclared = GrGLSLSetupFSColorOuput(gl.glslGeneration(),
+                                                   declared_color_output_name(),
+                                                   &colorOutput);
+    if (isColorDeclared) {
+        segments.fFSOutputs.push_back(colorOutput);
+    }
 
 #if GR_GL_ATTRIBUTE_MATRICES
     segments.fVSAttrs.push_back().set(GrGLShaderVar::kMat33f_Type,
@@ -825,19 +815,19 @@
     bool wroteFragColorZero = false;
     if (SkXfermode::kZero_Coeff == uniformCoeff &&
         SkXfermode::kZero_Coeff == colorCoeff &&
-        !fProgramDesc.fColorMatrixEnabled) {
+        !applyColorMatrix) {
         segments.fFSCode.appendf("\t%s = %s;\n",
-                                 fsColorOutput,
+                                 colorOutput.getName().c_str(),
                                  all_zeros_vec(4));
         wroteFragColorZero = true;
     } else if (SkXfermode::kDst_Mode != fProgramDesc.fColorFilterXfermode) {
-        segments.fFSCode.appendf("\tvec4 filteredColor;\n");
+        segments.fFSCode.append("\tvec4 filteredColor;\n");
         const char* color = adjustInColor(inColor);
         addColorFilter(&segments.fFSCode, "filteredColor", uniformCoeff,
                        colorCoeff, color);
         inColor = "filteredColor";
     }
-    if (fProgramDesc.fColorMatrixEnabled) {
+    if (applyColorMatrix) {
         segments.fFSUnis.push_back().set(GrGLShaderVar::kMat44f_Type,
                                          GrGLShaderVar::kUniform_TypeModifier,
                                          COL_MATRIX_UNI_NAME);
@@ -846,7 +836,7 @@
                                          COL_MATRIX_VEC_UNI_NAME);
         programData->fUniLocations.fColorMatrixUni = kUseUniform;
         programData->fUniLocations.fColorMatrixVecUni = kUseUniform;
-        segments.fFSCode.appendf("\tvec4 matrixedColor;\n");
+        segments.fFSCode.append("\tvec4 matrixedColor;\n");
         const char* color = adjustInColor(inColor);
         addColorMatrix(&segments.fFSCode, "matrixedColor", color);
         inColor = "matrixedColor";
@@ -856,58 +846,78 @@
     // compute the partial coverage (coverage stages and edge aa)
 
     GrStringBuilder inCoverage;
-
+    bool coverageIsZero = ProgramDesc::kTransBlack_ColorInput ==
+                          fProgramDesc.fCoverageInput;
     // we don't need to compute coverage at all if we know the final shader
     // output will be zero and we don't have a dual src blend output.
     if (!wroteFragColorZero ||
         ProgramDesc::kNone_DualSrcOutput != fProgramDesc.fDualSrcOutput) {
 
-        // get edge AA coverage and use it as inCoverage to first coverage stage
-        this->genEdgeCoverage(gl, layout, programData, &inCoverage, &segments);
+        if (!coverageIsZero) {
+            this->genEdgeCoverage(gl,
+                                  layout,
+                                  programData,
+                                  &inCoverage,
+                                  &segments);
 
-        // include explicit per-vertex coverage if we have it
-        if (GrDrawTarget::kCoverage_VertexLayoutBit & layout) {
-            genPerVertexCoverage(&segments, &inCoverage);
-        }
+            switch (fProgramDesc.fCoverageInput) {
+                case ProgramDesc::kSolidWhite_ColorInput:
+                    // empty string implies solid white
+                    break;
+                case ProgramDesc::kAttribute_ColorInput:
+                    genAttributeCoverage(&segments, &inCoverage);
+                    break;
+                case ProgramDesc::kUniform_ColorInput:
+                    genUniformCoverage(&segments, programData, &inCoverage);
+                    break;
+                default:
+                    GrCrash("Unexpected input coverage.");
+            }
 
-        GrStringBuilder outCoverage;
-        const int& startStage = fProgramDesc.fFirstCoverageStage;
-        for (int s = startStage; s < GrDrawState::kNumStages; ++s) {
-            if (fProgramDesc.fStages[s].isEnabled()) {
-                // create var to hold stage output
-                outCoverage = "coverage";
-                outCoverage.appendS32(s);
-                segments.fFSCode.appendf("\tvec4 %s;\n", outCoverage.c_str());
+            GrStringBuilder outCoverage;
+            const int& startStage = fProgramDesc.fFirstCoverageStage;
+            for (int s = startStage; s < GrDrawState::kNumStages; ++s) {
+                if (fProgramDesc.fStages[s].isEnabled()) {
+                    // create var to hold stage output
+                    outCoverage = "coverage";
+                    outCoverage.appendS32(s);
+                    segments.fFSCode.appendf("\tvec4 %s;\n",
+                                             outCoverage.c_str());
 
-                const char* inCoords;
-                // figure out what our input coords are
-                if (GrDrawTarget::StagePosAsTexCoordVertexLayoutBit(s) & layout) {
-                    inCoords = POS_ATTR_NAME;
-                } else {
-                    int tcIdx = GrDrawTarget::VertexTexCoordsForStage(s, layout);
-                        // we better have input tex coordinates if stage is enabled.
-                    GrAssert(tcIdx >= 0);
-                    GrAssert(texCoordAttrs[tcIdx].size());
-                    inCoords = texCoordAttrs[tcIdx].c_str();
+                    const char* inCoords;
+                    // figure out what our input coords are
+                    if (GrDrawTarget::StagePosAsTexCoordVertexLayoutBit(s) &
+                        layout) {
+                        inCoords = POS_ATTR_NAME;
+                    } else {
+                        int tcIdx =
+                            GrDrawTarget::VertexTexCoordsForStage(s, layout);
+                        // we better have input tex coordinates if stage is
+                        // enabled.
+                        GrAssert(tcIdx >= 0);
+                        GrAssert(texCoordAttrs[tcIdx].size());
+                        inCoords = texCoordAttrs[tcIdx].c_str();
+                    }
+
+                    genStageCode(gl, s,
+                                 fProgramDesc.fStages[s],
+                                 inCoverage.size() ? inCoverage.c_str() : NULL,
+                                 outCoverage.c_str(),
+                                 inCoords,
+                                 &segments,
+                                 &programData->fUniLocations.fStages[s]);
+                    inCoverage = outCoverage;
                 }
-
-                genStageCode(gl, s,
-                             fProgramDesc.fStages[s],
-                             inCoverage.size() ? inCoverage.c_str() : NULL,
-                             outCoverage.c_str(),
-                             inCoords,
-                             &segments,
-                             &programData->fUniLocations.fStages[s]);
-                inCoverage = outCoverage;
             }
         }
         if (ProgramDesc::kNone_DualSrcOutput != fProgramDesc.fDualSrcOutput) {
             segments.fFSOutputs.push_back().set(GrGLShaderVar::kVec4f_Type,
                 GrGLShaderVar::kOut_TypeModifier,
                 dual_source_output_name());
-            bool outputIsZero = false;
+            bool outputIsZero = coverageIsZero;
             GrStringBuilder coeff;
-            if (ProgramDesc::kCoverage_DualSrcOutput !=
+            if (!outputIsZero &&
+                ProgramDesc::kCoverage_DualSrcOutput !=
                 fProgramDesc.fDualSrcOutput && !wroteFragColorZero) {
                 if (!inColor.size()) {
                     outputIsZero = true;
@@ -938,17 +948,32 @@
     // combine color and coverage as frag color
 
     if (!wroteFragColorZero) {
-        modulate_helper(fsColorOutput,
-                        inColor.c_str(),
-                        inCoverage.c_str(),
-                        &segments.fFSCode);
-        if (ProgramDesc::kNo_OutputPM == fProgramDesc.fOutputPM) {
-            segments.fFSCode.appendf("\t%s = %s.a <= 0.0 ? vec4(0,0,0,0) : vec4(%s.rgb / %s.a, %s.a);\n",
-                                     fsColorOutput,
-                                     fsColorOutput,
-                                     fsColorOutput,
-                                     fsColorOutput,
-                                     fsColorOutput);
+        if (coverageIsZero) {
+            segments.fFSCode.appendf("\t%s = %s;\n",
+                                     colorOutput.getName().c_str(),
+                                     all_zeros_vec(4));
+        } else {
+            modulate_helper(colorOutput.getName().c_str(),
+                            inColor.c_str(),
+                            inCoverage.c_str(),
+                            &segments.fFSCode);
+        }
+        if (ProgramDesc::kUnpremultiplied_RoundDown_OutputConfig ==
+            fProgramDesc.fOutputConfig) {
+            segments.fFSCode.appendf("\t%s = %s.a <= 0.0 ? vec4(0,0,0,0) : vec4(floor(%s.rgb / %s.a * 255.0)/255.0, %s.a);\n",
+                                        colorOutput.getName().c_str(),
+                                        colorOutput.getName().c_str(),
+                                        colorOutput.getName().c_str(),
+                                        colorOutput.getName().c_str(),
+                                        colorOutput.getName().c_str());
+        } else if (ProgramDesc::kUnpremultiplied_RoundUp_OutputConfig ==
+                   fProgramDesc.fOutputConfig) {
+            segments.fFSCode.appendf("\t%s = %s.a <= 0.0 ? vec4(0,0,0,0) : vec4(ceil(%s.rgb / %s.a * 255.0)/255.0, %s.a);\n",
+                                        colorOutput.getName().c_str(),
+                                        colorOutput.getName().c_str(),
+                                        colorOutput.getName().c_str(),
+                                        colorOutput.getName().c_str(),
+                                        colorOutput.getName().c_str());
         }
     }
 
@@ -958,13 +983,13 @@
     ///////////////////////////////////////////////////////////////////////////
     // insert GS
 #if GR_DEBUG
-    this->genGeometryShader(gl, glslGeneration, &segments);
+    this->genGeometryShader(gl, &segments);
 #endif
 
     ///////////////////////////////////////////////////////////////////////////
     // compile and setup attribs and unis
 
-    if (!CompileShaders(gl, glslGeneration, segments, programData)) {
+    if (!CompileShaders(gl, segments, programData)) {
         return false;
     }
 
@@ -983,12 +1008,11 @@
 namespace {
 
 inline void expand_decls(const VarArray& vars,
-                         const GrGLInterface* gl,
-                         GrStringBuilder* string,
-                         GrGLSLGeneration gen) {
+                         const GrGLContextInfo& gl,
+                         GrStringBuilder* string) {
     const int count = vars.count();
     for (int i = 0; i < count; ++i) {
-        vars[i].appendDecl(gl, string, gen);
+        vars[i].appendDecl(gl, string);
     }
 }
 
@@ -1026,19 +1050,17 @@
 }
 
 inline void append_decls(const VarArray& vars,
-                         const GrGLInterface* gl,
+                         const GrGLContextInfo& gl,
                          StrArray* strings,
                          LengthArray* lengths,
-                         TempArray* temp,
-                         GrGLSLGeneration gen) {
-    expand_decls(vars, gl, &temp->push_back(), gen);
+                         TempArray* temp) {
+    expand_decls(vars, gl, &temp->push_back());
     append_string(temp->back(), strings, lengths);
 }
 
 }
 
-bool GrGLProgram::CompileShaders(const GrGLInterface* gl,
-                                 GrGLSLGeneration glslGeneration,
+bool GrGLProgram::CompileShaders(const GrGLContextInfo& gl,
                                  const ShaderCodeSegments& segments,
                                  CachedData* programData) {
     enum { kPreAllocStringCnt = 8 };
@@ -1052,11 +1074,9 @@
     GrStringBuilder outputs;
 
     append_string(segments.fHeader, &strs, &lengths);
-    append_decls(segments.fVSUnis, gl, &strs, &lengths, &temps, glslGeneration);
-    append_decls(segments.fVSAttrs, gl, &strs, &lengths,
-                 &temps, glslGeneration);
-    append_decls(segments.fVSOutputs, gl, &strs, &lengths,
-                 &temps, glslGeneration);
+    append_decls(segments.fVSUnis, gl, &strs, &lengths, &temps);
+    append_decls(segments.fVSAttrs, gl, &strs, &lengths, &temps);
+    append_decls(segments.fVSOutputs, gl, &strs, &lengths, &temps);
     append_string(segments.fVSCode, &strs, &lengths);
 
 #if PRINT_SHADERS
@@ -1077,10 +1097,8 @@
         temps.reset();
         append_string(segments.fHeader, &strs, &lengths);
         append_string(segments.fGSHeader, &strs, &lengths);
-        append_decls(segments.fGSInputs, gl, &strs, &lengths,
-                     &temps, glslGeneration);
-        append_decls(segments.fGSOutputs, gl, &strs, &lengths,
-                     &temps, glslGeneration);
+        append_decls(segments.fGSInputs, gl, &strs, &lengths, &temps);
+        append_decls(segments.fGSOutputs, gl, &strs, &lengths, &temps);
         append_string(segments.fGSCode, &strs, &lengths);
 #if PRINT_SHADERS
         print_shader(strs.count(), &strs[0], &lengths[0]);
@@ -1098,16 +1116,14 @@
     temps.reset();
 
     append_string(segments.fHeader, &strs, &lengths);
-    GrStringBuilder precisionStr(GrShaderPrecision(gl));
+    GrStringBuilder precisionStr(GrGetGLSLShaderPrecisionDecl(gl.binding()));
     append_string(precisionStr, &strs, &lengths);
-    append_decls(segments.fFSUnis, gl, &strs, &lengths, &temps, glslGeneration);
-    append_decls(segments.fFSInputs, gl, &strs, &lengths,
-                 &temps, glslGeneration);
+    append_decls(segments.fFSUnis, gl, &strs, &lengths, &temps);
+    append_decls(segments.fFSInputs, gl, &strs, &lengths, &temps);
     // We shouldn't have declared outputs on 1.10
-    GrAssert(k110_GLSLGeneration != glslGeneration ||
+    GrAssert(k110_GrGLSLGeneration != gl.glslGeneration() ||
              segments.fFSOutputs.empty());
-    append_decls(segments.fFSOutputs, gl, &strs, &lengths,
-                 &temps, glslGeneration);
+    append_decls(segments.fFSOutputs, gl, &strs, &lengths, &temps);
     append_string(segments.fFSFunctions, &strs, &lengths);
     append_string(segments.fFSCode, &strs, &lengths);
 
@@ -1127,7 +1143,10 @@
     return true;
 }
 
-GrGLuint GrGLProgram::CompileShader(const GrGLInterface* gl,
+#define GL_CALL(X) GR_GL_CALL(gl.interface(), X)
+#define GL_CALL_RET(R, X) GR_GL_CALL_RET(gl.interface(), R, X)
+
+GrGLuint GrGLProgram::CompileShader(const GrGLContextInfo& gl,
                                     GrGLenum type,
                                     int stringCnt,
                                     const char** strings,
@@ -1136,78 +1155,77 @@
                     "stringCount", SkStringPrintf("%i", stringCnt).c_str());
 
     GrGLuint shader;
-    GR_GL_CALL_RET(gl, shader, CreateShader(type));
+    GL_CALL_RET(shader, CreateShader(type));
     if (0 == shader) {
         return 0;
     }
 
     GrGLint compiled = GR_GL_INIT_ZERO;
-    GR_GL_CALL(gl, ShaderSource(shader, stringCnt, strings, stringLengths));
-    GR_GL_CALL(gl, CompileShader(shader));
-    GR_GL_CALL(gl, GetShaderiv(shader, GR_GL_COMPILE_STATUS, &compiled));
+    GL_CALL(ShaderSource(shader, stringCnt, strings, stringLengths));
+    GL_CALL(CompileShader(shader));
+    GL_CALL(GetShaderiv(shader, GR_GL_COMPILE_STATUS, &compiled));
 
     if (!compiled) {
         GrGLint infoLen = GR_GL_INIT_ZERO;
-        GR_GL_CALL(gl, GetShaderiv(shader, GR_GL_INFO_LOG_LENGTH, &infoLen));
+        GL_CALL(GetShaderiv(shader, GR_GL_INFO_LOG_LENGTH, &infoLen));
         SkAutoMalloc log(sizeof(char)*(infoLen+1)); // outside if for debugger
         if (infoLen > 0) {
             // retrieve length even though we don't need it to workaround
             // bug in chrome cmd buffer param validation.
             GrGLsizei length = GR_GL_INIT_ZERO;
-            GR_GL_CALL(gl, GetShaderInfoLog(shader, infoLen+1, 
-                                            &length, (char*)log.get()));
+            GL_CALL(GetShaderInfoLog(shader, infoLen+1, 
+                                         &length, (char*)log.get()));
             print_shader(stringCnt, strings, stringLengths);
             GrPrintf("\n%s", log.get());
         }
         GrAssert(!"Shader compilation failed!");
-        GR_GL_CALL(gl, DeleteShader(shader));
+        GL_CALL(DeleteShader(shader));
         return 0;
     }
     return shader;
 }
 
 bool GrGLProgram::bindOutputsAttribsAndLinkProgram(
-                                        const GrGLInterface* gl,
+                                        const GrGLContextInfo& gl,
                                         GrStringBuilder texCoordAttrNames[],
                                         bool bindColorOut,
                                         bool bindDualSrcOut,
                                         CachedData* programData) const {
-    GR_GL_CALL_RET(gl, programData->fProgramID, CreateProgram());
+    GL_CALL_RET(programData->fProgramID, CreateProgram());
     if (!programData->fProgramID) {
         return false;
     }
     const GrGLint& progID = programData->fProgramID;
 
-    GR_GL_CALL(gl, AttachShader(progID, programData->fVShaderID));
+    GL_CALL(AttachShader(progID, programData->fVShaderID));
     if (programData->fGShaderID) {
-        GR_GL_CALL(gl, AttachShader(progID, programData->fGShaderID));
+        GL_CALL(AttachShader(progID, programData->fGShaderID));
     }
-    GR_GL_CALL(gl, AttachShader(progID, programData->fFShaderID));
+    GL_CALL(AttachShader(progID, programData->fFShaderID));
 
     if (bindColorOut) {
-        GR_GL_CALL(gl, BindFragDataLocation(programData->fProgramID,
-                                          0, declared_color_output_name()));
+        GL_CALL(BindFragDataLocation(programData->fProgramID,
+                                     0, declared_color_output_name()));
     }
     if (bindDualSrcOut) {
-        GR_GL_CALL(gl, BindFragDataLocationIndexed(programData->fProgramID,
-                                          0, 1, dual_source_output_name()));
+        GL_CALL(BindFragDataLocationIndexed(programData->fProgramID,
+                                            0, 1, dual_source_output_name()));
     }
 
     // Bind the attrib locations to same values for all shaders
-    GR_GL_CALL(gl, BindAttribLocation(progID, PositionAttributeIdx(),
-                                      POS_ATTR_NAME));
+    GL_CALL(BindAttribLocation(progID, PositionAttributeIdx(), POS_ATTR_NAME));
     for (int t = 0; t < GrDrawState::kMaxTexCoords; ++t) {
         if (texCoordAttrNames[t].size()) {
-            GR_GL_CALL(gl, BindAttribLocation(progID,
-                                              TexCoordAttributeIdx(t),
-                                              texCoordAttrNames[t].c_str()));
+            GL_CALL(BindAttribLocation(progID,
+                                       TexCoordAttributeIdx(t),
+                                       texCoordAttrNames[t].c_str()));
         }
     }
 
     if (kSetAsAttribute == programData->fUniLocations.fViewMatrixUni) {
-        GR_GL_CALL(gl, BindAttribLocation(progID,
-                                          ViewMatrixAttributeIdx(),
-                                          VIEW_MATRIX_NAME));
+        GL_CALL(BindAttribLocation(progID,
+                                   ViewMatrixAttributeIdx(),
+                                   VIEW_MATRIX_NAME));
     }
 
     for (int s = 0; s < GrDrawState::kNumStages; ++s) {
@@ -1215,76 +1233,80 @@
         if (kSetAsAttribute == unis.fTextureMatrixUni) {
             GrStringBuilder matName;
             tex_matrix_name(s, &matName);
-            GR_GL_CALL(gl, BindAttribLocation(progID,
-                                              TextureMatrixAttributeIdx(s),
-                                              matName.c_str()));
+            GL_CALL(BindAttribLocation(progID,
+                                       TextureMatrixAttributeIdx(s),
+                                       matName.c_str()));
         }
     }
 
-    GR_GL_CALL(gl, BindAttribLocation(progID, ColorAttributeIdx(),
-                                      COL_ATTR_NAME));
-    GR_GL_CALL(gl, BindAttribLocation(progID, CoverageAttributeIdx(),
-                                      COV_ATTR_NAME));
-    GR_GL_CALL(gl, BindAttribLocation(progID, EdgeAttributeIdx(),
-                                      EDGE_ATTR_NAME));
+    GL_CALL(BindAttribLocation(progID, ColorAttributeIdx(), COL_ATTR_NAME));
+    GL_CALL(BindAttribLocation(progID, CoverageAttributeIdx(), COV_ATTR_NAME));
+    GL_CALL(BindAttribLocation(progID, EdgeAttributeIdx(), EDGE_ATTR_NAME));
 
-    GR_GL_CALL(gl, LinkProgram(progID));
+    GL_CALL(LinkProgram(progID));
 
     GrGLint linked = GR_GL_INIT_ZERO;
-    GR_GL_CALL(gl, GetProgramiv(progID, GR_GL_LINK_STATUS, &linked));
+    GL_CALL(GetProgramiv(progID, GR_GL_LINK_STATUS, &linked));
     if (!linked) {
         GrGLint infoLen = GR_GL_INIT_ZERO;
-        GR_GL_CALL(gl, GetProgramiv(progID, GR_GL_INFO_LOG_LENGTH, &infoLen));
+        GL_CALL(GetProgramiv(progID, GR_GL_INFO_LOG_LENGTH, &infoLen));
         SkAutoMalloc log(sizeof(char)*(infoLen+1));  // outside if for debugger
         if (infoLen > 0) {
             // retrieve length even though we don't need it to workaround
             // bug in chrome cmd buffer param validation.
             GrGLsizei length = GR_GL_INIT_ZERO;
-            GR_GL_CALL(gl, GetProgramInfoLog(progID, infoLen+1,
-                                             &length, (char*)log.get()));
+            GL_CALL(GetProgramInfoLog(progID,
+                                      infoLen+1,
+                                      &length,
+                                      (char*)log.get()));
             GrPrintf((char*)log.get());
         }
         GrAssert(!"Error linking program");
-        GR_GL_CALL(gl, DeleteProgram(progID));
+        GL_CALL(DeleteProgram(progID));
         programData->fProgramID = 0;
         return false;
     }
     return true;
 }
 
-void GrGLProgram::getUniformLocationsAndInitCache(const GrGLInterface* gl, 
+void GrGLProgram::getUniformLocationsAndInitCache(const GrGLContextInfo& gl,
                                                   CachedData* programData) const {
     const GrGLint& progID = programData->fProgramID;
 
     if (kUseUniform == programData->fUniLocations.fViewMatrixUni) {
-        GR_GL_CALL_RET(gl, programData->fUniLocations.fViewMatrixUni,
-                       GetUniformLocation(progID, VIEW_MATRIX_NAME));
+        GL_CALL_RET(programData->fUniLocations.fViewMatrixUni,
+                    GetUniformLocation(progID, VIEW_MATRIX_NAME));
         GrAssert(kUnusedUniform != programData->fUniLocations.fViewMatrixUni);
     }
     if (kUseUniform == programData->fUniLocations.fColorUni) {
-        GR_GL_CALL_RET(gl, programData->fUniLocations.fColorUni,
-                       GetUniformLocation(progID, COL_UNI_NAME));
+        GL_CALL_RET(programData->fUniLocations.fColorUni,
+                    GetUniformLocation(progID, COL_UNI_NAME));
         GrAssert(kUnusedUniform != programData->fUniLocations.fColorUni);
     }
     if (kUseUniform == programData->fUniLocations.fColorFilterUni) {
-        GR_GL_CALL_RET(gl, programData->fUniLocations.fColorFilterUni, 
-                       GetUniformLocation(progID, COL_FILTER_UNI_NAME));
+        GL_CALL_RET(programData->fUniLocations.fColorFilterUni, 
+                    GetUniformLocation(progID, COL_FILTER_UNI_NAME));
         GrAssert(kUnusedUniform != programData->fUniLocations.fColorFilterUni);
     }
 
     if (kUseUniform == programData->fUniLocations.fColorMatrixUni) {
-        GR_GL_CALL_RET(gl, programData->fUniLocations.fColorMatrixUni,
-                       GetUniformLocation(progID, COL_MATRIX_UNI_NAME));
+        GL_CALL_RET(programData->fUniLocations.fColorMatrixUni,
+                    GetUniformLocation(progID, COL_MATRIX_UNI_NAME));
     }
 
     if (kUseUniform == programData->fUniLocations.fColorMatrixVecUni) {
-        GR_GL_CALL_RET(gl, programData->fUniLocations.fColorMatrixVecUni,
-                       GetUniformLocation(progID, COL_MATRIX_VEC_UNI_NAME));
+        GL_CALL_RET(programData->fUniLocations.fColorMatrixVecUni,
+                    GetUniformLocation(progID, COL_MATRIX_VEC_UNI_NAME));
+    }
+    if (kUseUniform == programData->fUniLocations.fCoverageUni) {
+        GL_CALL_RET(programData->fUniLocations.fCoverageUni,
+                    GetUniformLocation(progID, COV_UNI_NAME));
+        GrAssert(kUnusedUniform != programData->fUniLocations.fCoverageUni);
     }
 
     if (kUseUniform == programData->fUniLocations.fEdgesUni) {
-        GR_GL_CALL_RET(gl, programData->fUniLocations.fEdgesUni,
-                       GetUniformLocation(progID, EDGES_UNI_NAME));
+        GL_CALL_RET(programData->fUniLocations.fEdgesUni,
+                    GetUniformLocation(progID, EDGES_UNI_NAME));
         GrAssert(kUnusedUniform != programData->fUniLocations.fEdgesUni);
     } else {
         programData->fUniLocations.fEdgesUni = kUnusedUniform;
@@ -1296,65 +1318,65 @@
             if (kUseUniform == locations.fTextureMatrixUni) {
                 GrStringBuilder texMName;
                 tex_matrix_name(s, &texMName);
-                GR_GL_CALL_RET(gl, locations.fTextureMatrixUni,
-                               GetUniformLocation(progID, texMName.c_str()));
+                GL_CALL_RET(locations.fTextureMatrixUni,
+                            GetUniformLocation(progID, texMName.c_str()));
                 GrAssert(kUnusedUniform != locations.fTextureMatrixUni);
             }
 
             if (kUseUniform == locations.fSamplerUni) {
                 GrStringBuilder samplerName;
                 sampler_name(s, &samplerName);
-                GR_GL_CALL_RET(gl, locations.fSamplerUni,
-                               GetUniformLocation(progID,samplerName.c_str()));
+                GL_CALL_RET(locations.fSamplerUni,
+                            GetUniformLocation(progID,samplerName.c_str()));
                 GrAssert(kUnusedUniform != locations.fSamplerUni);
             }
 
             if (kUseUniform == locations.fNormalizedTexelSizeUni) {
                 GrStringBuilder texelSizeName;
                 normalized_texel_size_name(s, &texelSizeName);
-                GR_GL_CALL_RET(gl, locations.fNormalizedTexelSizeUni,
-                               GetUniformLocation(progID, texelSizeName.c_str()));
+                GL_CALL_RET(locations.fNormalizedTexelSizeUni,
+                            GetUniformLocation(progID, texelSizeName.c_str()));
                 GrAssert(kUnusedUniform != locations.fNormalizedTexelSizeUni);
             }
 
             if (kUseUniform == locations.fRadial2Uni) {
                 GrStringBuilder radial2ParamName;
                 radial2_param_name(s, &radial2ParamName);
-                GR_GL_CALL_RET(gl, locations.fRadial2Uni,
-                               GetUniformLocation(progID, radial2ParamName.c_str()));
+                GL_CALL_RET(locations.fRadial2Uni,
+                            GetUniformLocation(progID, radial2ParamName.c_str()));
                 GrAssert(kUnusedUniform != locations.fRadial2Uni);
             }
 
             if (kUseUniform == locations.fTexDomUni) {
                 GrStringBuilder texDomName;
                 tex_domain_name(s, &texDomName);
-                GR_GL_CALL_RET(gl, locations.fTexDomUni,
-                               GetUniformLocation(progID, texDomName.c_str()));
+                GL_CALL_RET(locations.fTexDomUni,
+                            GetUniformLocation(progID, texDomName.c_str()));
                 GrAssert(kUnusedUniform != locations.fTexDomUni);
             }
 
             GrStringBuilder kernelName, imageIncrementName;
             convolve_param_names(s, &kernelName, &imageIncrementName);
             if (kUseUniform == locations.fKernelUni) {
-                GR_GL_CALL_RET(gl, locations.fKernelUni, 
-                               GetUniformLocation(progID, kernelName.c_str()));
+                GL_CALL_RET(locations.fKernelUni,
+                            GetUniformLocation(progID, kernelName.c_str()));
                 GrAssert(kUnusedUniform != locations.fKernelUni);
             }
 
             if (kUseUniform == locations.fImageIncrementUni) {
-                GR_GL_CALL_RET(gl, locations.fImageIncrementUni, 
-                               GetUniformLocation(progID, 
-                                                  imageIncrementName.c_str()));
+                GL_CALL_RET(locations.fImageIncrementUni, 
+                            GetUniformLocation(progID, 
+                                               imageIncrementName.c_str()));
                 GrAssert(kUnusedUniform != locations.fImageIncrementUni);
             }
         }
     }
-    GR_GL_CALL(gl, UseProgram(progID));
+    GL_CALL(UseProgram(progID));
 
     // init sampler unis and set bogus values for state tracking
     for (int s = 0; s < GrDrawState::kNumStages; ++s) {
         if (kUnusedUniform != programData->fUniLocations.fStages[s].fSamplerUni) {
-            GR_GL_CALL(gl, Uniform1i(programData->fUniLocations.fStages[s].fSamplerUni, s));
+            GL_CALL(Uniform1i(programData->fUniLocations.fStages[s].fSamplerUni, s));
         }
         programData->fTextureMatrices[s] = GrMatrix::InvalidMatrix();
         programData->fRadial2CenterX1[s] = GR_ScalarMax;
@@ -1634,14 +1656,76 @@
     segments->fFSCode.appendf("\t\t%s += %s;\n",
                               coordVar.c_str(),
                               imageIncrementName);
-    segments->fFSCode.appendf("\t}\n");
+    segments->fFSCode.append("\t}\n");
     segments->fFSCode.appendf("\t%s = %s%s;\n", fsOutColor,
                               sumVar.c_str(), modulate.c_str());
 }
+ 
+void genMorphologyVS(int stageNum,
+                     const StageDesc& desc,
+                     ShaderCodeSegments* segments,
+                     GrGLProgram::StageUniLocations* locations,
+                     const char** imageIncrementName,
+                     const char* varyingVSName) {
+    GrGLShaderVar* imgInc = &segments->fFSUnis.push_back();
+    imgInc->setType(GrGLShaderVar::kVec2f_Type);
+    imgInc->setTypeModifier(GrGLShaderVar::kUniform_TypeModifier);
+
+    image_increment_param_name(stageNum, imgInc->accessName());
+    *imageIncrementName = imgInc->getName().c_str();
+
+    // need image increment in both VS and FS
+    segments->fVSUnis.push_back(*imgInc).setEmitPrecision(true);
+
+    locations->fImageIncrementUni = kUseUniform;
+    segments->fVSCode.appendf("\t%s -= vec2(%d, %d) * %s;\n",
+                                  varyingVSName, desc.fKernelWidth,
+                                  desc.fKernelWidth, *imageIncrementName);
+}
+ 
+void genMorphologyFS(int stageNum,
+                     const StageDesc& desc,
+                     ShaderCodeSegments* segments,
+                     const char* samplerName,
+                     const char* swizzle,
+                     const char* imageIncrementName,
+                     const char* fsOutColor,
+                     GrStringBuilder& sampleCoords,
+                     GrStringBuilder& texFunc,
+                     GrStringBuilder& modulate) {
+    GrStringBuilder valueVar("value");
+    valueVar.appendS32(stageNum);
+    GrStringBuilder coordVar("coord");
+    coordVar.appendS32(stageNum);
+    bool isDilate = StageDesc::kDilate_FetchMode == desc.fFetchMode;
+
+   if (isDilate) {
+        segments->fFSCode.appendf("\tvec4 %s = vec4(0, 0, 0, 0);\n",
+                                  valueVar.c_str());
+    } else {
+        segments->fFSCode.appendf("\tvec4 %s = vec4(1, 1, 1, 1);\n",
+                                  valueVar.c_str());
+    }
+    segments->fFSCode.appendf("\tvec2 %s = %s;\n", 
+                              coordVar.c_str(),
+                              sampleCoords.c_str());
+    segments->fFSCode.appendf("\tfor (int i = 0; i < %d; i++) {\n",
+                              desc.fKernelWidth * 2 + 1);
+    segments->fFSCode.appendf("\t\t%s = %s(%s, %s(%s, %s)%s);\n",
+                              valueVar.c_str(), isDilate ? "max" : "min",
+                              valueVar.c_str(), texFunc.c_str(),
+                              samplerName, coordVar.c_str(), swizzle);
+    segments->fFSCode.appendf("\t\t%s += %s;\n",
+                              coordVar.c_str(),
+                              imageIncrementName);
+    segments->fFSCode.appendf("\t}\n");
+    segments->fFSCode.appendf("\t%s = %s%s;\n", fsOutColor,
+                              valueVar.c_str(), modulate.c_str());
+}
 
 }
 
-void GrGLProgram::genStageCode(const GrGLInterface* gl,
+void GrGLProgram::genStageCode(const GrGLContextInfo& gl,
                                int stageNum,
                                const GrGLProgram::StageDesc& desc,
                                const char* fsInColor, // NULL means no incoming color
@@ -1738,6 +1822,10 @@
     if (StageDesc::kConvolution_FetchMode == desc.fFetchMode) {
         genConvolutionVS(stageNum, desc, segments, locations,
                          &kernel, &imageIncrementName, varyingVSName);
+    } else if (StageDesc::kDilate_FetchMode == desc.fFetchMode ||
+               StageDesc::kErode_FetchMode == desc.fFetchMode) {
+        genMorphologyVS(stageNum, desc, segments, locations,
+                        &imageIncrementName, varyingVSName);
     }
 
     /// Fragment Shader Stuff
@@ -1801,13 +1889,16 @@
 
     };
 
+    static const uint32_t kMulByAlphaMask =
+        (StageDesc::kMulRGBByAlpha_RoundUp_InConfigFlag |
+         StageDesc::kMulRGBByAlpha_RoundDown_InConfigFlag);
+
     const char* swizzle = "";
     if (desc.fInConfigFlags & StageDesc::kSwapRAndB_InConfigFlag) {
         GrAssert(!(desc.fInConfigFlags & StageDesc::kSmearAlpha_InConfigFlag));
         swizzle = ".bgra";
     } else if (desc.fInConfigFlags & StageDesc::kSmearAlpha_InConfigFlag) {
-        GrAssert(!(desc.fInConfigFlags &
-                   StageDesc::kMulRGBByAlpha_InConfigFlag));
+        GrAssert(!(desc.fInConfigFlags & kMulByAlphaMask));
         swizzle = ".aaaa";
     } 
 
@@ -1835,30 +1926,44 @@
 
     switch (desc.fFetchMode) {
     case StageDesc::k2x2_FetchMode:
-        GrAssert(!(desc.fInConfigFlags &
-                   StageDesc::kMulRGBByAlpha_InConfigFlag));
+        GrAssert(!(desc.fInConfigFlags & kMulByAlphaMask));
         gen2x2FS(stageNum, segments, locations, &sampleCoords,
             samplerName, texelSizeName, swizzle, fsOutColor,
             texFunc, modulate, complexCoord, coordDims);
         break;
     case StageDesc::kConvolution_FetchMode:
-        GrAssert(!(desc.fInConfigFlags &
-                   StageDesc::kMulRGBByAlpha_InConfigFlag));
+        GrAssert(!(desc.fInConfigFlags & kMulByAlphaMask));
         genConvolutionFS(stageNum, desc, segments,
             samplerName, kernel, swizzle, imageIncrementName, fsOutColor,
             sampleCoords, texFunc, modulate);
         break;
+    case StageDesc::kDilate_FetchMode:
+    case StageDesc::kErode_FetchMode:
+        GrAssert(!(desc.fInConfigFlags & kMulByAlphaMask));
+        genMorphologyFS(stageNum, desc, segments,
+            samplerName, swizzle, imageIncrementName, fsOutColor,
+            sampleCoords, texFunc, modulate);
+        break;
     default:
-        if (desc.fInConfigFlags & StageDesc::kMulRGBByAlpha_InConfigFlag) {
+        if (desc.fInConfigFlags & kMulByAlphaMask) {
+            // only one of the mul by alpha flags should be set
+            GrAssert(GrIsPow2(kMulByAlphaMask & desc.fInConfigFlags));
             GrAssert(!(desc.fInConfigFlags & 
                        StageDesc::kSmearAlpha_InConfigFlag));
             segments->fFSCode.appendf("\t%s = %s(%s, %s)%s;\n",
                                       fsOutColor, texFunc.c_str(), 
                                       samplerName, sampleCoords.c_str(),
                                       swizzle);
-            segments->fFSCode.appendf("\t%s = vec4(%s.rgb*%s.a,%s.a)%s;\n",
-                                      fsOutColor, fsOutColor, fsOutColor,
-                                      fsOutColor, modulate.c_str());
+            if (desc.fInConfigFlags &
+                StageDesc::kMulRGBByAlpha_RoundUp_InConfigFlag) {
+                segments->fFSCode.appendf("\t%s = vec4(ceil(%s.rgb*%s.a*255.0)/255.0,%s.a)%s;\n",
+                                          fsOutColor, fsOutColor, fsOutColor,
+                                          fsOutColor, modulate.c_str());
+            } else {
+                segments->fFSCode.appendf("\t%s = vec4(floor(%s.rgb*%s.a*255.0)/255.0,%s.a)%s;\n",
+                                          fsOutColor, fsOutColor, fsOutColor,
+                                          fsOutColor, modulate.c_str());
+            }
         } else {
             segments->fFSCode.appendf("\t%s = %s(%s, %s)%s%s;\n",
                                       fsOutColor, texFunc.c_str(), 
diff --git a/src/gpu/GrGLProgram.h b/src/gpu/gl/GrGLProgram.h
similarity index 84%
rename from src/gpu/GrGLProgram.h
rename to src/gpu/gl/GrGLProgram.h
index b4ad4af..76f9c90 100644
--- a/src/gpu/GrGLProgram.h
+++ b/src/gpu/gl/GrGLProgram.h
@@ -10,11 +10,11 @@
 #ifndef GrGLProgram_DEFINED
 #define GrGLProgram_DEFINED
 
-#include "GrDrawState.h"
-#include "GrGLInterface.h"
+#include "../GrDrawState.h"
+#include "GrGLContextInfo.h"
 #include "GrGLSL.h"
-#include "GrStringBuilder.h"
-#include "GrGpu.h"
+#include "../GrStringBuilder.h"
+#include "../GrGpu.h"
 
 #include "SkXfermode.h"
 
@@ -48,8 +48,7 @@
      *  The result of heavy init is not stored in datamembers of GrGLProgam,
      *  but in a separate cacheable container.
      */
-    bool genProgram(const GrGLInterface* gl,
-                    GrGLSLGeneration glslVersion,
+    bool genProgram(const GrGLContextInfo& gl,
                     CachedData* programData) const;
 
      /**
@@ -87,13 +86,17 @@
             memset(this, 0, sizeof(ProgramDesc));
         }
 
-        enum OutputPM {
+        enum OutputConfig {
             // PM-color OR color with no alpha channel
-            kYes_OutputPM,
-            // nonPM-color with alpha channel
-            kNo_OutputPM,
+            kPremultiplied_OutputConfig,
+            // nonPM-color with alpha channel. Round components up after
+            // dividing by alpha. Assumes output is 8 bits for r, g, and b.
+            kUnpremultiplied_RoundUp_OutputConfig,
+            // nonPM-color with alpha channel. Round components down after
+            // dividing by alpha. Assumes output is 8 bits for r, g, and b.
+            kUnpremultiplied_RoundDown_OutputConfig,
 
-            kOutputPMCnt
+            kOutputConfigCnt
         };
 
         struct StageDesc {
@@ -107,6 +110,8 @@
                 kSingle_FetchMode,
                 k2x2_FetchMode,
                 kConvolution_FetchMode,
+                kErode_FetchMode,
+                kDilate_FetchMode,
 
                 kFetchModeCnt,
             };
@@ -115,7 +120,7 @@
               described are performed after reading a texel.
              */
             enum InConfigFlags {
-                kNone_InConfigFlag              = 0x0,
+                kNone_InConfigFlag                      = 0x0,
 
                 /**
                   Swap the R and B channels. This is incompatible with
@@ -123,21 +128,27 @@
                   the shader using GL_ARB_texture_swizzle if possible rather
                   than setting this flag.
                  */
-                kSwapRAndB_InConfigFlag         = 0x1,
+                kSwapRAndB_InConfigFlag                 = 0x1,
 
                 /**
                  Smear alpha across all four channels. This is incompatible with
-                 kSwapRAndB and kPremul.  It is prefereable to perform the
-                 smear outside the shader using GL_ARB_texture_swizzle if
+                 kSwapRAndB and kMulRGBByAlpha*. It is preferable to perform
+                 the smear outside the shader using GL_ARB_texture_swizzle if
                  possible rather than setting this flag.
                 */
-                kSmearAlpha_InConfigFlag        = 0x2,
+                kSmearAlpha_InConfigFlag                = 0x2,
 
                 /**
                  Multiply r,g,b by a after texture reads. This flag incompatible
                  with kSmearAlpha and may only be used with FetchMode kSingle.
+
+                 It is assumed the src texture has 8bit color components. After
+                 reading the texture one version rounds up to the next multiple
+                 of 1/255.0 and the other rounds down. At most one of these
+                 flags may be set.
                  */
-                kMulRGBByAlpha_InConfigFlag     =  0x4,
+                kMulRGBByAlpha_RoundUp_InConfigFlag     =  0x4,
+                kMulRGBByAlpha_RoundDown_InConfigFlag   =  0x8,
 
                 kDummyInConfigFlag,
                 kInConfigBitMask = (kDummyInConfigFlag-1) |
@@ -211,7 +222,8 @@
 #endif
 
         uint8_t fColorInput;        // casts to enum ColorInput
-        uint8_t fOutputPM;          // cases to enum OutputPM
+        uint8_t fCoverageInput;     // casts to enum CoverageInput
+        uint8_t fOutputConfig;      // casts to enum OutputConfig
         uint8_t fDualSrcOutput;     // casts to enum DualSrcOutput
         int8_t fFirstCoverageStage;
         SkBool8 fEmitsPointSize;
@@ -261,6 +273,7 @@
     struct UniLocations {
         GrGLint fViewMatrixUni;
         GrGLint fColorUni;
+        GrGLint fCoverageUni;
         GrGLint fEdgesUni;
         GrGLint fColorFilterUni;
         GrGLint fColorMatrixUni;
@@ -269,6 +282,7 @@
         void reset() {
             fViewMatrixUni = kUnusedUniform;
             fColorUni = kUnusedUniform;
+            fCoverageUni = kUnusedUniform;
             fEdgesUni = kUnusedUniform;
             fColorFilterUni = kUnusedUniform;
             fColorMatrixUni = kUnusedUniform;
@@ -306,6 +320,7 @@
         // these reflect the current values of uniforms
         // (GL uniform values travel with program)
         GrColor                     fColor;
+        GrColor                     fCoverage;
         GrColor                     fColorFilterColor;
         GrMatrix                    fTextureMatrices[GrDrawState::kNumStages];
         // width and height used for normalized texel size
@@ -335,7 +350,7 @@
 private:
 
     // Determines which uniforms will need to be bound.
-    void genStageCode(const GrGLInterface* gl,
+    void genStageCode(const GrGLContextInfo& gl,
                       int stageNum,
                       const ProgramDesc::StageDesc& desc,
                       const char* fsInColor, // NULL means no incoming color
@@ -344,25 +359,23 @@
                       ShaderCodeSegments* segments,
                       StageUniLocations* locations) const;
 
-    void genGeometryShader(const GrGLInterface* gl,
-                           GrGLSLGeneration glslVersion,
+    void genGeometryShader(const GrGLContextInfo& gl,
                            ShaderCodeSegments* segments) const;
 
     // generates code to compute coverage based on edge AA.
-    void genEdgeCoverage(const GrGLInterface* gl,
+    void genEdgeCoverage(const GrGLContextInfo& gl,
                          GrVertexLayout layout,
                          CachedData* programData,
                          GrStringBuilder* coverageVar,
                          ShaderCodeSegments* segments) const;
 
-    static bool CompileShaders(const GrGLInterface* gl,
-                               GrGLSLGeneration glslVersion,
+    static bool CompileShaders(const GrGLContextInfo& gl,
                                const ShaderCodeSegments& segments, 
                                CachedData* programData);
 
     // Compiles a GL shader, returns shader ID or 0 if failed
     // params have same meaning as glShaderSource
-    static GrGLuint CompileShader(const GrGLInterface* gl,
+    static GrGLuint CompileShader(const GrGLContextInfo& gl,
                                   GrGLenum type, int stringCnt,
                                   const char** strings,
                                   int* stringLengths);
@@ -370,14 +383,14 @@
     // Creates a GL program ID, binds shader attributes to GL vertex attrs, and
     // links the program
     bool bindOutputsAttribsAndLinkProgram(
-                const GrGLInterface* gl,
+                const GrGLContextInfo& gl,
                 GrStringBuilder texCoordAttrNames[GrDrawState::kMaxTexCoords],
                 bool bindColorOut,
                 bool bindDualSrcOut,
                 CachedData* programData) const;
 
     // Binds uniforms; initializes cache to invalid values.
-    void getUniformLocationsAndInitCache(const GrGLInterface* gl,
+    void getUniformLocationsAndInitCache(const GrGLContextInfo& gl,
                                          CachedData* programData) const;
 
     friend class GrGpuGLShaders;
diff --git a/src/gpu/GrGLRenderTarget.cpp b/src/gpu/gl/GrGLRenderTarget.cpp
similarity index 100%
rename from src/gpu/GrGLRenderTarget.cpp
rename to src/gpu/gl/GrGLRenderTarget.cpp
diff --git a/src/gpu/GrGLRenderTarget.h b/src/gpu/gl/GrGLRenderTarget.h
similarity index 100%
rename from src/gpu/GrGLRenderTarget.h
rename to src/gpu/gl/GrGLRenderTarget.h
diff --git a/src/gpu/gl/GrGLSL.cpp b/src/gpu/gl/GrGLSL.cpp
new file mode 100644
index 0000000..e933ee8
--- /dev/null
+++ b/src/gpu/gl/GrGLSL.cpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright 2011 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "GrGLSL.h"
+#include "GrGLShaderVar.h"
+
+GrGLSLGeneration GrGetGLSLGeneration(GrGLBinding binding,
+                                     const GrGLInterface* gl) {
+    GrGLSLVersion ver = GrGLGetGLSLVersion(gl);
+    switch (binding) {
+        case kDesktop_GrGLBinding:  // round down to a generation we emit
+            GrAssert(ver >= GR_GLSL_VER(1,10));
+            if (ver >= GR_GLSL_VER(1,50)) {
+                return k150_GrGLSLGeneration;
+            } else if (ver >= GR_GLSL_VER(1,30)) {
+                return k130_GrGLSLGeneration;
+            } else {
+                return k110_GrGLSLGeneration;
+            }
+        case kES2_GrGLBinding:
+            // ES GLSL 1.00 (based on desktop 1.20) maps to the 110 generation
+            GrAssert(ver >= GR_GLSL_VER(1,00));
+            return k110_GrGLSLGeneration;
+        default:
+            GrCrash("Unknown GL Binding");
+            return k110_GrGLSLGeneration; // suppress warning
+    }
+}
+
+const char* GrGetGLSLVersionDecl(GrGLBinding binding,
+                                   GrGLSLGeneration gen) {
+    switch (gen) {
+        case k110_GrGLSLGeneration:
+            if (kES2_GrGLBinding == binding) {
+                // ES2's shading language is based on desktop GLSL 1.20 but is
+                // itself version 1.00 of the ES language.
+                return "#version 100\n";
+            } else {
+                GrAssert(kDesktop_GrGLBinding == binding);
+                return "#version 110\n";
+            }
+        case k130_GrGLSLGeneration:  // only expected with the desktop binding
+            GrAssert(kDesktop_GrGLBinding == binding);
+            return "#version 130\n";
+        case k150_GrGLSLGeneration:  // only expected with the desktop binding
+            GrAssert(kDesktop_GrGLBinding == binding);
+            return "#version 150\n";
+        default:
+            GrCrash("Unknown GL version.");
+            return ""; // suppress warning
+    }
+}
+
+const char* GrGetGLSLVarPrecisionDeclType(GrGLBinding binding) {
+    if (kES2_GrGLBinding == binding) {
+        return "mediump";
+    } else {
+        return "";  // no qualifier needed; callers append their own separator
+    }
+}
+
+const char* GrGetGLSLShaderPrecisionDecl(GrGLBinding binding) {
+    if (kES2_GrGLBinding == binding) {
+        return "precision mediump float;\n";  // default fp precision for ES2
+    } else {
+        return "";  // other bindings: no precision statement is emitted
+    }
+}
+
+bool GrGLSLSetupFSColorOuput(GrGLSLGeneration gen,
+                             const char* nameIfDeclared,
+                             GrGLShaderVar* var) {
+    bool declaredOutput = k110_GrGLSLGeneration != gen;  // false only for 1.10
+    var->set(GrGLShaderVar::kVec4f_Type,
+             GrGLShaderVar::kOut_TypeModifier,
+             declaredOutput ? nameIfDeclared : "gl_FragColor");  // else built-in
+    return declaredOutput;  // true => caller must declare and bind the output
+}
diff --git a/src/gpu/gl/GrGLSL.h b/src/gpu/gl/GrGLSL.h
new file mode 100644
index 0000000..a3d3921
--- /dev/null
+++ b/src/gpu/gl/GrGLSL.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright 2011 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef GrGLSL_DEFINED
+#define GrGLSL_DEFINED
+
+#include "gl/GrGLInterface.h"
+
+class GrGLShaderVar;
+
+// Limited set of GLSL versions we build shaders for. Caller should round
+// down the GLSL version to one of these enums.
+enum GrGLSLGeneration {
+    /**
+     * Desktop GLSL 1.10 and ES2 shading lang (based on desktop GLSL 1.20)
+     */
+    k110_GrGLSLGeneration,
+    /**
+     * Desktop GLSL 1.30
+     */
+    k130_GrGLSLGeneration,
+    /**
+     * Desktop GLSL 1.50
+     */
+    k150_GrGLSLGeneration,
+};
+
+/**
+ * Gets the most recent GLSL Generation compatible with the OpenGL context.
+ */
+GrGLSLGeneration GrGetGLSLGeneration(GrGLBinding binding,
+                                     const GrGLInterface* gl);
+
+/**
+ * Returns a string to include at the beginning of a shader to declare the GLSL
+ * version.
+ */
+const char* GrGetGLSLVersionDecl(GrGLBinding binding,
+                                 GrGLSLGeneration v);
+
+/**
+ * Returns a string to include in a variable declaration to set the fp precision
+ * or an empty string if precision is not required.
+ */
+const char* GrGetGLSLVarPrecisionDeclType(GrGLBinding binding);
+
+/**
+ * Returns a string to set the default fp precision for an entire shader, or
+ * an empty string if precision is not required.
+ */
+const char* GrGetGLSLShaderPrecisionDecl(GrGLBinding binding);
+
+/**
+ * Depending on the GLSL version being emitted there may be an assumed output
+ * variable from the fragment shader for the color. Otherwise, the shader must
+ * declare an output variable for the color. If this function returns true:
+ *    * Parameter var's name will be set to nameIfDeclared
+ *    * The variable must be declared in the fragment shader
+ *    * The variable has to be bound as the color output 
+ *      (using glBindFragDataLocation)
+ *    If the function returns false:
+ *    * Parameter var's name will be set to the GLSL built-in color output name.
+ *    * Do not declare the variable in the shader.
+ *    * Do not use glBindFragDataLocation to bind the variable
+ * In either case var is initialized to represent the color output in the
+ * shader.
+ */
+ bool GrGLSLSetupFSColorOuput(GrGLSLGeneration gen,
+                             const char* nameIfDeclared,
+                             GrGLShaderVar* var);
+
+#endif
diff --git a/src/gpu/GrGLShaderVar.h b/src/gpu/gl/GrGLShaderVar.h
similarity index 93%
rename from src/gpu/GrGLShaderVar.h
rename to src/gpu/gl/GrGLShaderVar.h
index 1d5d7ca..dc7d52b 100644
--- a/src/gpu/GrGLShaderVar.h
+++ b/src/gpu/gl/GrGLShaderVar.h
@@ -9,9 +9,9 @@
 #ifndef GrGLShaderVar_DEFINED
 #define GrGLShaderVar_DEFINED
 
-#include "GrGLInterface.h"
+#include "GrGLContextInfo.h"
 #include "GrGLSL.h"
-#include "GrStringBuilder.h"
+#include "../GrStringBuilder.h"
 
 #define USE_UNIFORM_FLOAT_ARRAYS true
 
@@ -200,14 +200,14 @@
     /**
      * Write a declaration of this variable to out.
      */
-    void appendDecl(const GrGLInterface* gl, GrStringBuilder* out,
-                    GrGLSLGeneration gen) const {
+    void appendDecl(const GrGLContextInfo& gl, GrStringBuilder* out) const {
         if (this->getTypeModifier() != kNone_TypeModifier) {
-           out->append(TypeModifierString(this->getTypeModifier(), gen));
+           out->append(TypeModifierString(this->getTypeModifier(),
+                                          gl.glslGeneration()));
            out->append(" ");
         }
         if (this->emitsPrecision()) {
-            out->append(PrecisionString(gl));
+            out->append(GrGetGLSLVarPrecisionDeclType(gl.binding()));
             out->append(" ");
         }
         Type effectiveType = this->getType();
@@ -268,23 +268,19 @@
     }
 
 private:
-    static const char* PrecisionString(const GrGLInterface* gl) {
-        return gl->supportsDesktop() ? "" : "mediump";
-    }
-
     static const char* TypeModifierString(TypeModifier t,
                                           GrGLSLGeneration gen) {
         switch (t) {
             case kNone_TypeModifier:
                 return "";
             case kOut_TypeModifier:
-                return k110_GLSLGeneration == gen ? "varying" : "out";
+                return k110_GrGLSLGeneration == gen ? "varying" : "out";
             case kIn_TypeModifier:
-                return k110_GLSLGeneration == gen ? "varying" : "in";
+                return k110_GrGLSLGeneration == gen ? "varying" : "in";
             case kUniform_TypeModifier:
                 return "uniform";
             case kAttribute_TypeModifier:
-                return k110_GLSLGeneration == gen ? "attribute" : "in";
+                return k110_GrGLSLGeneration == gen ? "attribute" : "in";
             default:
                 GrCrash("Unknown shader variable type modifier.");
                 return ""; // suppress warning
diff --git a/src/gpu/GrGLStencilBuffer.cpp b/src/gpu/gl/GrGLStencilBuffer.cpp
similarity index 100%
rename from src/gpu/GrGLStencilBuffer.cpp
rename to src/gpu/gl/GrGLStencilBuffer.cpp
diff --git a/src/gpu/GrGLStencilBuffer.h b/src/gpu/gl/GrGLStencilBuffer.h
similarity index 92%
rename from src/gpu/GrGLStencilBuffer.h
rename to src/gpu/gl/GrGLStencilBuffer.h
index eaf7942..908921a 100644
--- a/src/gpu/GrGLStencilBuffer.h
+++ b/src/gpu/gl/GrGLStencilBuffer.h
@@ -10,12 +10,13 @@
 #ifndef GrGLStencilBuffer_DEFINED
 #define GrGLStencilBuffer_DEFINED
 
-#include "GrGLInterface.h"
-#include "GrStencilBuffer.h"
+#include "gl/GrGLInterface.h"
+#include "../GrStencilBuffer.h"
 
 class GrGLStencilBuffer : public GrStencilBuffer {
 public:
     static const GrGLenum kUnknownInternalFormat = ~0;
+    static const GrGLuint kUnknownBitCount = ~0;
     struct Format {
         GrGLenum  fInternalFormat;
         GrGLuint  fStencilBits;
diff --git a/src/gpu/GrGLTexture.cpp b/src/gpu/gl/GrGLTexture.cpp
similarity index 100%
rename from src/gpu/GrGLTexture.cpp
rename to src/gpu/gl/GrGLTexture.cpp
diff --git a/src/gpu/GrGLTexture.h b/src/gpu/gl/GrGLTexture.h
similarity index 99%
rename from src/gpu/GrGLTexture.h
rename to src/gpu/gl/GrGLTexture.h
index 664742c..d13fc44 100644
--- a/src/gpu/GrGLTexture.h
+++ b/src/gpu/gl/GrGLTexture.h
@@ -10,7 +10,7 @@
 #ifndef GrGLTexture_DEFINED
 #define GrGLTexture_DEFINED
 
-#include "GrGpu.h"
+#include "../GrGpu.h"
 #include "GrGLRenderTarget.h"
 
 /**
diff --git a/src/gpu/GrGLUtil.cpp b/src/gpu/gl/GrGLUtil.cpp
similarity index 93%
rename from src/gpu/GrGLUtil.cpp
rename to src/gpu/gl/GrGLUtil.cpp
index f12b407..23ed5b4 100644
--- a/src/gpu/GrGLUtil.cpp
+++ b/src/gpu/gl/GrGLUtil.cpp
@@ -7,8 +7,8 @@
  */
 
 
-#include "GrGLConfig.h"
-#include "GrGLInterface.h"
+#include "gl/GrGLConfig.h"
+#include "gl/GrGLInterface.h"
 
 void GrGLClearErr(const GrGLInterface* gl) {
     while (GR_GL_NO_ERROR != gl->fGetError()) {}
diff --git a/src/gpu/GrGLVertexBuffer.cpp b/src/gpu/gl/GrGLVertexBuffer.cpp
similarity index 74%
rename from src/gpu/GrGLVertexBuffer.cpp
rename to src/gpu/gl/GrGLVertexBuffer.cpp
index 33c1e7e..48479dc 100644
--- a/src/gpu/GrGLVertexBuffer.cpp
+++ b/src/gpu/gl/GrGLVertexBuffer.cpp
@@ -101,25 +101,44 @@
     }
     this->bind();
     GrGLenum usage = dynamic() ? GR_GL_DYNAMIC_DRAW : GR_GL_STATIC_DRAW;
-#if !GR_GL_USE_BUFFER_DATA_NULL_HINT
-    // Note that we're cheating on the size here. Currently no methods
-    // allow a partial update that preserves contents of non-updated
-    // portions of the buffer (and lock() does a glBufferData(..size, NULL..))
-    GL_CALL(BufferData(GR_GL_ARRAY_BUFFER, srcSizeInBytes, src, usage));
-#else
+
+#if GR_GL_USE_BUFFER_DATA_NULL_HINT
     if (this->sizeInBytes() == srcSizeInBytes) {
         GL_CALL(BufferData(GR_GL_ARRAY_BUFFER, srcSizeInBytes, src, usage));
     } else {
         // Before we call glBufferSubData we give the driver a hint using
         // glBufferData with NULL. This makes the old buffer contents
-        // inaccessible to future draws. The GPU may still be processing draws
-        // that reference the old contents. With this hint it can assign a
-        // different allocation for the new contents to avoid flushing the gpu
-        // past draws consuming the old contents.
+        // inaccessible to future draws. The GPU may still be processing
+        // draws that reference the old contents. With this hint it can
+        // assign a different allocation for the new contents to avoid
+        // flushing the gpu past draws consuming the old contents.
         GL_CALL(BufferData(GR_GL_ARRAY_BUFFER, 
                            this->sizeInBytes(), NULL, usage));
         GL_CALL(BufferSubData(GR_GL_ARRAY_BUFFER, 0, srcSizeInBytes, src));
     }
+#else
+    // Note that we're cheating on the size here. Currently no methods
+    // allow a partial update that preserves contents of non-updated
+    // portions of the buffer (lock() does a glBufferData(..size, NULL..))
+    bool doSubData = false;
+#if GR_GL_MAC_BUFFER_OBJECT_PERFOMANCE_WORKAROUND
+    static int N = 0;
+    // 128 was chosen experimentally. At 256 a slight hitchiness was noticed
+    // when dragging a Chromium window around with a canvas tab backgrounded.
+    doSubData = 0 == (N % 128);
+    ++N;
+#endif
+    if (doSubData) {
+        // The workaround is to do a glBufferData followed by glBufferSubData.
+        // Chromium's command buffer may turn a glBufferSubData where the size
+        // exactly matches the buffer size into a glBufferData. So we tack 1
+        // extra byte onto the glBufferData.
+        GL_CALL(BufferData(GR_GL_ARRAY_BUFFER, srcSizeInBytes + 1,
+                           NULL, usage));
+        GL_CALL(BufferSubData(GR_GL_ARRAY_BUFFER, 0, srcSizeInBytes, src));
+    } else {
+        GL_CALL(BufferData(GR_GL_ARRAY_BUFFER, srcSizeInBytes, src, usage));
+    }
 #endif
     return true;
 }
diff --git a/src/gpu/GrGLVertexBuffer.h b/src/gpu/gl/GrGLVertexBuffer.h
similarity index 94%
rename from src/gpu/GrGLVertexBuffer.h
rename to src/gpu/gl/GrGLVertexBuffer.h
index 15fc54a..5d2ba30 100644
--- a/src/gpu/GrGLVertexBuffer.h
+++ b/src/gpu/gl/GrGLVertexBuffer.h
@@ -11,8 +11,8 @@
 #ifndef GrGLVertexBuffer_DEFINED
 #define GrGLVertexBuffer_DEFINED
 
-#include "GrVertexBuffer.h"
-#include "GrGLInterface.h"
+#include "../GrVertexBuffer.h"
+#include "gl/GrGLInterface.h"
 
 class GrGpuGL;
 
diff --git a/src/gpu/GrGpuGL.cpp b/src/gpu/gl/GrGpuGL.cpp
similarity index 80%
rename from src/gpu/GrGpuGL.cpp
rename to src/gpu/gl/GrGpuGL.cpp
index d0fb12f..69880e5 100644
--- a/src/gpu/GrGpuGL.cpp
+++ b/src/gpu/gl/GrGpuGL.cpp
@@ -24,6 +24,19 @@
 
 #define SKIP_CACHE_CHECK    true
 
+#if GR_GL_CHECK_ALLOC_WITH_GET_ERROR
+    #define CLEAR_ERROR_BEFORE_ALLOC(iface)   GrGLClearErr(iface)
+    #define GL_ALLOC_CALL(iface, call)        GR_GL_CALL_NOERRCHECK(iface, call)
+    #define CHECK_ALLOC_ERROR(iface)          GR_GL_GET_ERROR(iface)
+#else 
+    #define CLEAR_ERROR_BEFORE_ALLOC(iface)
+    #define GL_ALLOC_CALL(iface, call)        GR_GL_CALL(iface, call)
+    #define CHECK_ALLOC_ERROR(iface)          GR_GL_NO_ERROR
+#endif
+
+
+///////////////////////////////////////////////////////////////////////////////
+
 static const GrGLenum gXfermodeCoeff2Blend[] = {
     GR_GL_ZERO,
     GR_GL_ONE,
@@ -164,29 +177,17 @@
     return status == GR_GL_FRAMEBUFFER_COMPLETE;
 }
 
-GrGpuGL::GrGpuGL(const GrGLInterface* gl, GrGLBinding glBinding) {
+GrGpuGL::GrGpuGL(const GrGLContextInfo& ctxInfo) : fGLContextInfo(ctxInfo) {
+
+    GrAssert(ctxInfo.isInitialized());
 
     fPrintedCaps = false;
 
-    gl->ref();
-    fGL = gl;
-    fGLBinding = glBinding;
-    switch (glBinding) {
-        case kDesktop_GrGLBinding:
-            GrAssert(gl->supportsDesktop());
-            break;
-        case kES2_GrGLBinding:
-            GrAssert(gl->supportsES2());
-            break;
-        default:
-            GrCrash("Expect exactly one valid GL binding bit to be in use.");
-    }
+    GrGLClearErr(fGLContextInfo.interface());
 
-    GrGLClearErr(fGL);
-
-    const GrGLubyte* ext;
-    GL_CALL_RET(ext, GetString(GR_GL_EXTENSIONS));
     if (gPrintStartupSpew) {
+        const GrGLubyte* ext;
+        GL_CALL_RET(ext, GetString(GR_GL_EXTENSIONS));
         const GrGLubyte* vendor;
         const GrGLubyte* renderer;
         const GrGLubyte* version;
@@ -201,15 +202,12 @@
         GrPrintf("------ EXTENSIONS\n %s \n", ext);
     }
 
-    fGLVersion = GrGLGetVersion(gl);
-    GrAssert(0 != fGLVersion);
-    fExtensionString = (const char*) ext;
-
     this->resetDirtyFlags();
 
     this->initCaps();
 
     fLastSuccessfulStencilFmtIdx = 0;
+    fCanPreserveUnpremulRoundtrip = kUnknown_CanPreserveUnpremulRoundtrip;
 }
 
 GrGpuGL::~GrGpuGL() {
@@ -218,39 +216,27 @@
     // This subclass must do this before the base class destructor runs
     // since we will unref the GrGLInterface.
     this->releaseResources();
-    fGL->unref();
 }
 
 ///////////////////////////////////////////////////////////////////////////////
 
-static const GrGLuint kUnknownBitCount = ~0;
-
 void GrGpuGL::initCaps() {
     GrGLint maxTextureUnits;
     // check FS and fixed-function texture unit limits
     // we only use textures in the fragment stage currently.
     // checks are > to make sure we have a spare unit.
-    GR_GL_GetIntegerv(fGL, GR_GL_MAX_TEXTURE_IMAGE_UNITS, &maxTextureUnits);
+    const GrGLInterface* gl = this->glInterface();
+    GR_GL_GetIntegerv(gl, GR_GL_MAX_TEXTURE_IMAGE_UNITS, &maxTextureUnits);
     GrAssert(maxTextureUnits > GrDrawState::kNumStages);
     if (kES2_GrGLBinding != this->glBinding()) {
-        GR_GL_GetIntegerv(fGL, GR_GL_MAX_TEXTURE_UNITS, &maxTextureUnits);
+        GR_GL_GetIntegerv(gl, GR_GL_MAX_TEXTURE_UNITS, &maxTextureUnits);
         GrAssert(maxTextureUnits > GrDrawState::kNumStages);
     }
-    if (kES2_GrGLBinding == this->glBinding()) {
-        GR_GL_GetIntegerv(fGL, GR_GL_MAX_FRAGMENT_UNIFORM_VECTORS,
-                          &fGLCaps.fMaxFragmentUniformVectors);
-    } else if (kDesktop_GrGLBinding != this->glBinding()) {
-        GrGLint max;
-        GR_GL_GetIntegerv(fGL, GR_GL_MAX_FRAGMENT_UNIFORM_COMPONENTS, &max);
-        fGLCaps.fMaxFragmentUniformVectors = max / 4;
-    } else {
-        fGLCaps.fMaxFragmentUniformVectors = 16;
-    }
 
     GrGLint numFormats;
-    GR_GL_GetIntegerv(fGL, GR_GL_NUM_COMPRESSED_TEXTURE_FORMATS, &numFormats);
+    GR_GL_GetIntegerv(gl, GR_GL_NUM_COMPRESSED_TEXTURE_FORMATS, &numFormats);
     SkAutoSTMalloc<10, GrGLint> formats(numFormats);
-    GR_GL_GetIntegerv(fGL, GR_GL_COMPRESSED_TEXTURE_FORMATS, formats);
+    GR_GL_GetIntegerv(gl, GR_GL_COMPRESSED_TEXTURE_FORMATS, formats);
     for (int i = 0; i < numFormats; ++i) {
         if (formats[i] == GR_GL_PALETTE8_RGBA8) {
             fCaps.f8BitPaletteSupport = true;
@@ -262,9 +248,9 @@
         // we could also look for GL_ATI_separate_stencil extension or
         // GL_EXT_stencil_two_side but they use different function signatures
         // than GL2.0+ (and than each other).
-        fCaps.fTwoSidedStencilSupport = (fGLVersion >= GR_GL_VER(2,0));
+        fCaps.fTwoSidedStencilSupport = (this->glVersion() >= GR_GL_VER(2,0));
         // supported on GL 1.4 and higher or by extension
-        fCaps.fStencilWrapOpsSupport = (fGLVersion >= GR_GL_VER(1,4)) ||
+        fCaps.fStencilWrapOpsSupport = (this->glVersion() >= GR_GL_VER(1,4)) ||
                                        this->hasExtension("GL_EXT_stencil_wrap");
     } else {
         // ES 2 has two sided stencil and stencil wrap
@@ -273,51 +259,6 @@
     }
 
     if (kDesktop_GrGLBinding == this->glBinding()) {
-        fGLCaps.fRGBA8RenderbufferSupport = true;
-    } else {
-        fGLCaps.fRGBA8RenderbufferSupport =
-                                    this->hasExtension("GL_OES_rgb8_rgba8") ||
-                                    this->hasExtension("GL_ARM_rgba8");
-    }
-
-
-    if (kDesktop_GrGLBinding == this->glBinding()) {
-        fGLCaps.fBGRAFormatSupport = this->glVersion() >= GR_GL_VER(1,2) ||
-                                     this->hasExtension("GL_EXT_bgra");
-    } else {
-        bool hasBGRAExt = false;
-        if (this->hasExtension("GL_APPLE_texture_format_BGRA8888")) {
-            fGLCaps.fBGRAFormatSupport = true;
-        } else if (this->hasExtension("GL_EXT_texture_format_BGRA8888")) {
-            fGLCaps.fBGRAFormatSupport = true;
-            fGLCaps.fBGRAIsInternalFormat = true;
-        }
-        GrAssert(fGLCaps.fBGRAFormatSupport ||
-                 kSkia8888_PM_GrPixelConfig != kBGRA_8888_PM_GrPixelConfig);
-    }
-
-    if (kDesktop_GrGLBinding == this->glBinding()) {
-        fGLCaps.fTextureSwizzleSupport = this->glVersion() >= GR_GL_VER(3,3) ||
-                                  this->hasExtension("GL_ARB_texture_swizzle");
-    } else {
-        fGLCaps.fTextureSwizzleSupport = false;
-    }
-
-    if (kDesktop_GrGLBinding == this->glBinding()) {
-        fGLCaps.fUnpackRowLengthSupport = true;
-        fGLCaps.fUnpackFlipYSupport = false;
-        fGLCaps.fPackRowLengthSupport = true;
-        fGLCaps.fPackFlipYSupport = false;
-    } else {
-        fGLCaps.fUnpackRowLengthSupport =this->hasExtension("GL_EXT_unpack_subimage");
-        fGLCaps.fUnpackFlipYSupport = this->hasExtension("GL_CHROMIUM_flipy");
-        // no extension for pack row length
-        fGLCaps.fPackRowLengthSupport = false;
-        fGLCaps.fPackFlipYSupport =
-            this->hasExtension("GL_ANGLE_pack_reverse_row_order");
-    }
-
-    if (kDesktop_GrGLBinding == this->glBinding()) {
         fCaps.fBufferLockSupport = true; // we require VBO support and the desktop VBO
                                          // extension includes glMapBuffer.
     } else {
@@ -325,7 +266,7 @@
     }
 
     if (kDesktop_GrGLBinding == this->glBinding()) {
-        if (fGLVersion >= GR_GL_VER(2,0) || 
+        if (this->glVersion() >= GR_GL_VER(2,0) || 
             this->hasExtension("GL_ARB_texture_non_power_of_two")) {
             fCaps.fNPOTTextureTileSupport = true;
         } else {
@@ -336,15 +277,6 @@
         fCaps.fNPOTTextureTileSupport = this->hasExtension("GL_OES_texture_npot");
     }
 
-    fGLCaps.fTextureUsageSupport = (kES2_GrGLBinding == this->glBinding()) &&
-                                   this->hasExtension("GL_ANGLE_texture_usage");
-
-    // Tex storage is in desktop 4.2 and can be an extension to desktop or ES.
-    fGLCaps.fTexStorageSupport = (kDesktop_GrGLBinding == this->glBinding() &&
-                                  fGLVersion >= GR_GL_VER(4,2)) ||
-                                 this->hasExtension("GL_ARB_texture_storage") ||
-                                 this->hasExtension("GL_EXT_texture_storage");
-
     fCaps.fHWAALineSupport = (kDesktop_GrGLBinding == this->glBinding());
 
     ////////////////////////////////////////////////////////////////////////////
@@ -352,103 +284,91 @@
     // TODO: Make these a preprocess that generate some compile time constants.
     // TODO: probe once at startup, rather than once per context creation.
 
-    GR_GL_GetIntegerv(fGL, GR_GL_MAX_TEXTURE_SIZE, &fCaps.fMaxTextureSize);
-    GR_GL_GetIntegerv(fGL, GR_GL_MAX_RENDERBUFFER_SIZE, &fCaps.fMaxRenderTargetSize);
+    GR_GL_GetIntegerv(gl, GR_GL_MAX_TEXTURE_SIZE, &fCaps.fMaxTextureSize);
+    GR_GL_GetIntegerv(gl, GR_GL_MAX_RENDERBUFFER_SIZE, &fCaps.fMaxRenderTargetSize);
     // Our render targets are always created with textures as the color
     // attachment, hence this min:
     fCaps.fMaxRenderTargetSize = GrMin(fCaps.fMaxTextureSize, fCaps.fMaxRenderTargetSize);
 
-    this->initFSAASupport();
-    this->initStencilFormats();
+    fCaps.fFSAASupport = GrGLCaps::kNone_MSFBOType != this->glCaps().msFBOType();
 }
 
-void GrGpuGL::initFSAASupport() {
-    // TODO: Get rid of GrAALevel and use # samples directly.
-    GR_STATIC_ASSERT(0 == kNone_GrAALevel);
-    GR_STATIC_ASSERT(1 == kLow_GrAALevel);
-    GR_STATIC_ASSERT(2 == kMed_GrAALevel);
-    GR_STATIC_ASSERT(3 == kHigh_GrAALevel);
-    memset(fGLCaps.fAASamples, 0, sizeof(fGLCaps.fAASamples));
+bool GrGpuGL::canPreserveReadWriteUnpremulPixels() {
+    if (kUnknown_CanPreserveUnpremulRoundtrip ==
+        fCanPreserveUnpremulRoundtrip) {
 
-    fGLCaps.fMSFBOType = GLCaps::kNone_MSFBO;
-    if (kDesktop_GrGLBinding != this->glBinding()) {
-       if (this->hasExtension("GL_CHROMIUM_framebuffer_multisample")) {
-           // chrome's extension is equivalent to the EXT msaa
-           // and fbo_blit extensions.
-            fGLCaps.fMSFBOType = GLCaps::kDesktopEXT_MSFBO;
-       } else if (this->hasExtension("GL_APPLE_framebuffer_multisample")) {
-            fGLCaps.fMSFBOType = GLCaps::kAppleES_MSFBO;
+        SkAutoTMalloc<uint32_t> data(256 * 256 * 3);
+        uint32_t* srcData = data.get();
+        uint32_t* firstRead = data.get() + 256 * 256;
+        uint32_t* secondRead = data.get() + 2 * 256 * 256;
+
+        for (int y = 0; y < 256; ++y) {
+            for (int x = 0; x < 256; ++x) {
+                uint8_t* color = reinterpret_cast<uint8_t*>(&srcData[256*y + x]);
+                color[3] = y;
+                color[2] = x;
+                color[1] = x;
+                color[0] = x;
+            }
         }
-    } else {
-        if ((fGLVersion >= GR_GL_VER(3,0)) || this->hasExtension("GL_ARB_framebuffer_object")) {
-            fGLCaps.fMSFBOType = GLCaps::kDesktopARB_MSFBO;
-        } else if (this->hasExtension("GL_EXT_framebuffer_multisample") &&
-                   this->hasExtension("GL_EXT_framebuffer_blit")) {
-            fGLCaps.fMSFBOType = GLCaps::kDesktopEXT_MSFBO;
+
+        // We have broader support for read/write pixels on render targets
+        // than on textures.
+        GrTextureDesc dstDesc;
+        dstDesc.fFlags = kRenderTarget_GrTextureFlagBit |
+                         kNoStencil_GrTextureFlagBit;
+        dstDesc.fWidth = 256;
+        dstDesc.fHeight = 256;
+        dstDesc.fConfig = kRGBA_8888_GrPixelConfig;
+        dstDesc.fSampleCnt = 0;
+
+        SkAutoTUnref<GrTexture> dstTex(this->createTexture(dstDesc, NULL, 0));
+        if (!dstTex.get()) {
+            return false;
         }
+        GrRenderTarget* rt = dstTex.get()->asRenderTarget();
+        GrAssert(NULL != rt);
+
+        bool failed = true;
+        static const UnpremulConversion gMethods[] = {
+            kUpOnWrite_DownOnRead_UnpremulConversion,
+            kDownOnWrite_UpOnRead_UnpremulConversion,
+        };
+
+        // pretend that we can do the roundtrip to avoid recursive calls to
+        // this function
+        fCanPreserveUnpremulRoundtrip = kYes_CanPreserveUnpremulRoundtrip;
+        for (size_t i = 0; i < GR_ARRAY_COUNT(gMethods) && failed; ++i) {
+            fUnpremulConversion = gMethods[i];
+            rt->writePixels(0, 0,
+                            256, 256,
+                            kRGBA_8888_UPM_GrPixelConfig, srcData, 0);
+            rt->readPixels(0, 0,
+                           256, 256,
+                           kRGBA_8888_UPM_GrPixelConfig, firstRead, 0);
+            rt->writePixels(0, 0,
+                            256, 256,
+                            kRGBA_8888_UPM_GrPixelConfig, firstRead, 0);
+            rt->readPixels(0, 0,
+                           256, 256,
+                           kRGBA_8888_UPM_GrPixelConfig, secondRead, 0);
+            failed = false;
+            for (int j = 0; j < 256 * 256; ++j) {
+                if (firstRead[j] != secondRead[j]) {
+                    failed = true;
+                    break;
+                }
+            }
+        }
+        fCanPreserveUnpremulRoundtrip = failed ? 
+                        kNo_CanPreserveUnpremulRoundtrip :
+                        kYes_CanPreserveUnpremulRoundtrip;
     }
 
-    if (GLCaps::kNone_MSFBO != fGLCaps.fMSFBOType) {
-        GrGLint maxSamples;
-        GR_GL_GetIntegerv(fGL, GR_GL_MAX_SAMPLES, &maxSamples);
-        if (maxSamples > 1 ) {
-            fGLCaps.fAASamples[kNone_GrAALevel] = 0;
-            fGLCaps.fAASamples[kLow_GrAALevel] =
-                GrMax(2, GrFixedFloorToInt((GR_FixedHalf) * maxSamples));
-            fGLCaps.fAASamples[kMed_GrAALevel] =
-                GrMax(2, GrFixedFloorToInt(((GR_Fixed1*3)/4) * maxSamples));
-            fGLCaps.fAASamples[kHigh_GrAALevel] = maxSamples;
-        }
-    }
-    fCaps.fFSAASupport = fGLCaps.fAASamples[kHigh_GrAALevel] > 0;
-}
-
-void GrGpuGL::initStencilFormats() {
-
-    // Build up list of legal stencil formats (though perhaps not supported on
-    // the particular gpu/driver) from most preferred to least.
-
-    // these consts are in order of most preferred to least preferred
-    // we don't bother with GL_STENCIL_INDEX1 or GL_DEPTH32F_STENCIL8
-    static const GrGLStencilBuffer::Format
-                  // internal Format      stencil bits      total bits        packed?
-        gS8    = {GR_GL_STENCIL_INDEX8,   8,                8,                false},
-        gS16   = {GR_GL_STENCIL_INDEX16,  16,               16,               false},
-        gD24S8 = {GR_GL_DEPTH24_STENCIL8, 8,                32,               true },
-        gS4    = {GR_GL_STENCIL_INDEX4,   4,                4,                false},
-        gS     = {GR_GL_STENCIL_INDEX,    kUnknownBitCount, kUnknownBitCount, false},
-        gDS    = {GR_GL_DEPTH_STENCIL,    kUnknownBitCount, kUnknownBitCount, true };
-
-    if (kDesktop_GrGLBinding == this->glBinding()) {
-        bool supportsPackedDS = fGLVersion >= GR_GL_VER(3,0) || 
-                                this->hasExtension("GL_EXT_packed_depth_stencil") ||
-                                this->hasExtension("GL_ARB_framebuffer_object");
-
-        // S1 thru S16 formats are in GL 3.0+, EXT_FBO, and ARB_FBO since we
-        // require FBO support we can expect these are legal formats and don't
-        // check. These also all support the unsized GL_STENCIL_INDEX.
-        fGLCaps.fStencilFormats.push_back() = gS8;
-        fGLCaps.fStencilFormats.push_back() = gS16;
-        if (supportsPackedDS) {
-            fGLCaps.fStencilFormats.push_back() = gD24S8;
-        }
-        fGLCaps.fStencilFormats.push_back() = gS4;
-        if (supportsPackedDS) {
-            fGLCaps.fStencilFormats.push_back() = gDS;
-        }
+    if (kYes_CanPreserveUnpremulRoundtrip == fCanPreserveUnpremulRoundtrip) {
+        return true;
     } else {
-        // ES2 has STENCIL_INDEX8 without extensions but requires extensions
-        // for other formats.
-        // ES doesn't support using the unsized format.
-
-        fGLCaps.fStencilFormats.push_back() = gS8;
-        //fStencilFormats.push_back() = gS16;
-        if (this->hasExtension("GL_OES_packed_depth_stencil")) {
-            fGLCaps.fStencilFormats.push_back() = gD24S8;
-        }
-        if (this->hasExtension("GL_OES_stencil4")) {
-            fGLCaps.fStencilFormats.push_back() = gS4;
-        }
+        return false;
     }
 }
 
@@ -476,7 +396,7 @@
     if (gPrintStartupSpew && !fPrintedCaps) {
         fPrintedCaps = true;
         this->getCaps().print();
-        fGLCaps.print();
+        this->glCaps().print();
     }
 
     // We detect cases when blending is effectively off
@@ -525,7 +445,7 @@
                                                   -GR_ScalarMax,
                                                   true);
         *fHWDrawState.sampler(s)->matrix() = GrMatrix::InvalidMatrix();
-        fHWDrawState.sampler(s)->setConvolutionParams(0, NULL, NULL);
+        fHWDrawState.sampler(s)->setConvolutionParams(0, NULL);
     }
 
     fHWBounds.fScissorRect.invalidate();
@@ -546,16 +466,16 @@
     fHWDrawState.setRenderTarget(NULL);
 
     // we assume these values
-    if (this->glCaps().fUnpackRowLengthSupport) {
+    if (this->glCaps().unpackRowLengthSupport()) {
         GL_CALL(PixelStorei(GR_GL_UNPACK_ROW_LENGTH, 0));
     }
-    if (this->glCaps().fPackRowLengthSupport) {
+    if (this->glCaps().packRowLengthSupport()) {
         GL_CALL(PixelStorei(GR_GL_PACK_ROW_LENGTH, 0));
     }
-    if (this->glCaps().fUnpackFlipYSupport) {
+    if (this->glCaps().unpackFlipYSupport()) {
         GL_CALL(PixelStorei(GR_GL_UNPACK_FLIP_Y, GR_GL_FALSE));
     }
-    if (this->glCaps().fPackFlipYSupport) {
+    if (this->glCaps().packFlipYSupport()) {
         GL_CALL(PixelStorei(GR_GL_PACK_REVERSE_ROW_ORDER, GR_GL_FALSE));
     }
 }
@@ -633,79 +553,6 @@
     return tgt;
 }
 
-GrResource* GrGpuGL::onCreatePlatformSurface(const GrPlatformSurfaceDesc& desc) {
-
-    bool isTexture = kTexture_GrPlatformSurfaceType == desc.fSurfaceType ||
-                     kTextureRenderTarget_GrPlatformSurfaceType == desc.fSurfaceType;
-    bool isRenderTarget = kRenderTarget_GrPlatformSurfaceType == desc.fSurfaceType ||
-                          kTextureRenderTarget_GrPlatformSurfaceType == desc.fSurfaceType;
-
-    GrGLRenderTarget::Desc rtDesc;
-    SkAutoTUnref<GrGLStencilBuffer> sb;
-
-    if (isRenderTarget) {
-        rtDesc.fRTFBOID = desc.fPlatformRenderTarget;
-        rtDesc.fConfig = desc.fConfig;
-        if (desc.fSampleCnt) {
-            if (kGrCanResolve_GrPlatformRenderTargetFlagBit  & desc.fRenderTargetFlags) {
-                rtDesc.fTexFBOID = desc.fPlatformResolveDestination;
-            } else {
-                GrAssert(!isTexture); // this should have been filtered by GrContext
-                rtDesc.fTexFBOID = GrGLRenderTarget::kUnresolvableFBOID;
-            }
-        } else {
-            rtDesc.fTexFBOID = desc.fPlatformRenderTarget;
-        }
-        // we don't know what the RB ids are without glGets and we don't care
-        // since we aren't responsible for deleting them.
-        rtDesc.fMSColorRenderbufferID = 0;
-        rtDesc.fSampleCnt = desc.fSampleCnt;
-        if (desc.fStencilBits) {
-            GrGLStencilBuffer::Format format;
-            format.fInternalFormat = GrGLStencilBuffer::kUnknownInternalFormat;
-            format.fPacked = false;
-            format.fStencilBits = desc.fStencilBits;
-            format.fTotalBits = desc.fStencilBits;
-            sb.reset(new GrGLStencilBuffer(this, 0, desc.fWidth, desc.fHeight,
-                                           rtDesc.fSampleCnt, format));
-        }
-        rtDesc.fOwnIDs = false;
-    }
-
-    if (isTexture) {
-        GrGLTexture::Desc texDesc;
-        if (!this->configToGLFormats(desc.fConfig, false, NULL, NULL, NULL)) {
-            return NULL;
-        }
-        texDesc.fWidth  = desc.fWidth;
-        texDesc.fHeight = desc.fHeight;
-
-        texDesc.fConfig             = desc.fConfig;
-        texDesc.fOrientation        = GrGLTexture::kBottomUp_Orientation;
-        texDesc.fTextureID          = desc.fPlatformTexture;
-        texDesc.fOwnsID             = false;
-        
-        if (isRenderTarget) {
-            GrTexture* tex = new GrGLTexture(this, texDesc, rtDesc);
-            tex->asRenderTarget()->setStencilBuffer(sb.get());
-            return tex;
-        } else {
-            return new GrGLTexture(this, texDesc);
-        }
-    } else {
-        GrGLIRect viewport;
-        viewport.fLeft   = 0;
-        viewport.fBottom = 0;
-        viewport.fWidth  = desc.fWidth;
-        viewport.fHeight = desc.fHeight;
-
-        GrGLRenderTarget* rt =  new GrGLRenderTarget(this, rtDesc, viewport);
-        rt->setStencilBuffer(sb.get());
-        return rt;
-    }
-}
-
-
 ////////////////////////////////////////////////////////////////////////////////
 
 void GrGpuGL::onWriteTexturePixels(GrTexture* texture,
@@ -778,7 +625,7 @@
     SkAutoSMalloc<128 * 128> tempStorage;
 
     bool useTexStorage = isNewTexture &&
-                         this->glCaps().fTexStorageSupport;
+                         this->glCaps().texStorageSupport();
     if (useTexStorage) {
         if (kDesktop_GrGLBinding == this->glBinding()) {
             // 565 is not a sized internal format on desktop GL. So on desktop
@@ -819,13 +666,13 @@
     bool glFlipY = false;
     if (NULL != data) {
         if (GrGLTexture::kBottomUp_Orientation == desc.fOrientation) {
-            if (this->glCaps().fUnpackFlipYSupport) {
+            if (this->glCaps().unpackFlipYSupport()) {
                 glFlipY = true;
             } else {
                 swFlipY = true;
             }
         }
-        if (this->glCaps().fUnpackRowLengthSupport && !swFlipY) {
+        if (this->glCaps().unpackRowLengthSupport() && !swFlipY) {
             // can't use this for flipping, only non-neg values allowed. :(
             if (rowBytes != trimRowBytes) {
                 GrGLint rowLength = static_cast<GrGLint>(rowBytes / bpp);
@@ -863,40 +710,39 @@
     if (isNewTexture && 
         0 == left && 0 == top &&
         desc.fWidth == width && desc.fHeight == height) {
-        GrGLClearErr(this->glInterface());
+        CLEAR_ERROR_BEFORE_ALLOC(this->glInterface());
         if (useTexStorage) {
             // We never resize  or change formats of textures. We don't use
             // mipmaps currently.
-            GR_GL_CALL_NOERRCHECK(this->glInterface(),
-                                  TexStorage2D(GR_GL_TEXTURE_2D,
-                                               1, // levels
-                                               internalFormat,
-                                               desc.fWidth, desc.fHeight));
+            GL_ALLOC_CALL(this->glInterface(),
+                          TexStorage2D(GR_GL_TEXTURE_2D,
+                                       1, // levels
+                                       internalFormat,
+                                       desc.fWidth, desc.fHeight));
         } else {
             if (GR_GL_PALETTE8_RGBA8 == internalFormat) {
                 GrGLsizei imageSize = desc.fWidth * desc.fHeight +
                                       kGrColorTableSize;
-                GR_GL_CALL_NOERRCHECK(this->glInterface(),
-                                      CompressedTexImage2D(GR_GL_TEXTURE_2D,
-                                                           0, // level
-                                                           internalFormat,
-                                                           desc.fWidth,
-                                                           desc.fHeight,
-                                                           0, // border
-                                                           imageSize,
-                                                           data));
+                GL_ALLOC_CALL(this->glInterface(),
+                              CompressedTexImage2D(GR_GL_TEXTURE_2D,
+                                                   0, // level
+                                                   internalFormat,
+                                                   desc.fWidth, desc.fHeight,
+                                                   0, // border
+                                                   imageSize,
+                                                   data));
             } else {
-                GR_GL_CALL_NOERRCHECK(this->glInterface(),
-                                      TexImage2D(GR_GL_TEXTURE_2D,
-                                                 0, // level
-                                                 internalFormat,
-                                                 desc.fWidth, desc.fHeight,
-                                                 0, // border
-                                                 externalFormat, externalType,
-                                                 data));
+                GL_ALLOC_CALL(this->glInterface(),
+                              TexImage2D(GR_GL_TEXTURE_2D,
+                                         0, // level
+                                         internalFormat,
+                                         desc.fWidth, desc.fHeight,
+                                         0, // border
+                                         externalFormat, externalType,
+                                         data));
             }
         }
-        GrGLenum error = GR_GL_GET_ERROR(this->glInterface());
+        GrGLenum error = CHECK_ALLOC_ERROR(this->glInterface());
         if (error != GR_GL_NO_ERROR) {
             succeeded = false;
         } else {
@@ -923,7 +769,7 @@
     }
 
     if (restoreGLRowLength) {
-        GrAssert(this->glCaps().fUnpackRowLengthSupport);
+        GrAssert(this->glCaps().unpackRowLengthSupport());
         GL_CALL(PixelStorei(GR_GL_UNPACK_ROW_LENGTH, 0));
     }
     if (glFlipY) {
@@ -954,7 +800,7 @@
     // If we are using multisampling we will create two FBOS. We render
     // to one and then resolve to the texture bound to the other.
     if (desc->fSampleCnt > 0) {
-        if (GLCaps::kNone_MSFBO == fGLCaps.fMSFBOType) {
+        if (GrGLCaps::kNone_MSFBOType == this->glCaps().msFBOType()) {
             goto FAILED;
         }
         GL_CALL(GenFramebuffers(1, &desc->fRTFBOID));
@@ -977,13 +823,13 @@
         GrAssert(desc->fSampleCnt > 1);
         GL_CALL(BindRenderbuffer(GR_GL_RENDERBUFFER,
                                desc->fMSColorRenderbufferID));
-        GrGLClearErr(this->glInterface());
-        GR_GL_CALL_NOERRCHECK(this->glInterface(),
-                              RenderbufferStorageMultisample(GR_GL_RENDERBUFFER, 
-                                                             desc->fSampleCnt,
-                                                             msColorFormat,
-                                                             width, height));
-        err = GR_GL_GET_ERROR(this->glInterface());
+        CLEAR_ERROR_BEFORE_ALLOC(this->glInterface());
+        GL_ALLOC_CALL(this->glInterface(),
+                      RenderbufferStorageMultisample(GR_GL_RENDERBUFFER, 
+                                                     desc->fSampleCnt,
+                                                     msColorFormat,
+                                                     width, height));
+        err = CHECK_ALLOC_ERROR(this->glInterface());
         if (err != GR_GL_NO_ERROR) {
             goto FAILED;
         }
@@ -992,9 +838,13 @@
                                       GR_GL_COLOR_ATTACHMENT0,
                                       GR_GL_RENDERBUFFER,
                                       desc->fMSColorRenderbufferID));
-        GL_CALL_RET(status, CheckFramebufferStatus(GR_GL_FRAMEBUFFER));
-        if (status != GR_GL_FRAMEBUFFER_COMPLETE) {
-            goto FAILED;
+        if (!this->glCaps().isConfigVerifiedColorAttachment(desc->fConfig)) {
+            GL_CALL_RET(status, CheckFramebufferStatus(GR_GL_FRAMEBUFFER));
+            if (status != GR_GL_FRAMEBUFFER_COMPLETE) {
+                goto FAILED;
+            }
+            fGLContextInfo.caps().markConfigAsValidColorAttachment(
+                                                                desc->fConfig);
         }
     }
     GL_CALL(BindFramebuffer(GR_GL_FRAMEBUFFER, desc->fTexFBOID));
@@ -1003,9 +853,12 @@
                                  GR_GL_COLOR_ATTACHMENT0,
                                  GR_GL_TEXTURE_2D,
                                  texID, 0));
-    GL_CALL_RET(status, CheckFramebufferStatus(GR_GL_FRAMEBUFFER));
-    if (status != GR_GL_FRAMEBUFFER_COMPLETE) {
-        goto FAILED;
+    if (!this->glCaps().isConfigVerifiedColorAttachment(desc->fConfig)) {
+        GL_CALL_RET(status, CheckFramebufferStatus(GR_GL_FRAMEBUFFER));
+        if (status != GR_GL_FRAMEBUFFER_COMPLETE) {
+            goto FAILED;
+        }
+        fGLContextInfo.caps().markConfigAsValidColorAttachment(desc->fConfig);
     }
 
     return true;
@@ -1046,10 +899,13 @@
     GrGLTexture::Desc glTexDesc;
     GrGLRenderTarget::Desc  glRTDesc;
 
+    // Attempt to catch un- or wrongly initialized sample counts;
+    GrAssert(desc.fSampleCnt >= 0 && desc.fSampleCnt <= 64);
+
     glTexDesc.fWidth  = desc.fWidth;
     glTexDesc.fHeight = desc.fHeight;
-    glTexDesc.fConfig        = desc.fConfig;
-    glTexDesc.fOwnsID        = true;
+    glTexDesc.fConfig = desc.fConfig;
+    glTexDesc.fOwnsID = true;
 
     glRTDesc.fMSColorRenderbufferID = 0;
     glRTDesc.fRTFBOID = 0;
@@ -1067,11 +923,10 @@
     glTexDesc.fOrientation = renderTarget ? GrGLTexture::kBottomUp_Orientation :
                                             GrGLTexture::kTopDown_Orientation;
 
-    GrAssert(as_size_t(desc.fAALevel) < GR_ARRAY_COUNT(fGLCaps.fAASamples));
-    glRTDesc.fSampleCnt = fGLCaps.fAASamples[desc.fAALevel];
-    if (GLCaps::kNone_MSFBO == fGLCaps.fMSFBOType &&
-        desc.fAALevel != kNone_GrAALevel) {
-        GrPrintf("AA RT requested but not supported on this platform.");
+    glRTDesc.fSampleCnt = desc.fSampleCnt;
+    if (GrGLCaps::kNone_MSFBOType == this->glCaps().msFBOType() &&
+        desc.fSampleCnt) {
+        GrPrintf("MSAA RT requested but not supported on this platform.");
     }
 
     if (renderTarget) {
@@ -1082,7 +937,7 @@
     }
 
     GL_CALL(GenTextures(1, &glTexDesc.fTextureID));
-    if (renderTarget && this->glCaps().fTextureUsageSupport) {
+    if (renderTarget && this->glCaps().textureUsageSupport()) {
         // provides a hint about how this texture will be used
         GL_CALL(TexParameteri(GR_GL_TEXTURE_2D,
                               GR_GL_TEXTURE_USAGE,
@@ -1148,6 +1003,9 @@
 }
 
 namespace {
+
+const GrGLuint kUnknownBitCount = GrGLStencilBuffer::kUnknownBitCount;
+
 void inline get_stencil_rb_sizes(const GrGLInterface* gl,
                                  GrGLuint rb, 
                                  GrGLStencilBuffer::Format* format) {
@@ -1188,33 +1046,32 @@
 
     GrGLStencilBuffer* sb = NULL;
 
-    int stencilFmtCnt = fGLCaps.fStencilFormats.count();
+    int stencilFmtCnt = this->glCaps().stencilFormats().count();
     for (int i = 0; i < stencilFmtCnt; ++i) {
         GL_CALL(BindRenderbuffer(GR_GL_RENDERBUFFER, sbID));
         // we start with the last stencil format that succeeded in hopes
         // that we won't go through this loop more than once after the
         // first (painful) stencil creation.
         int sIdx = (i + fLastSuccessfulStencilFmtIdx) % stencilFmtCnt;
-        const GrGLStencilBuffer::Format& sFmt = fGLCaps.fStencilFormats[sIdx];
-        GrGLClearErr(this->glInterface());
+        const GrGLCaps::StencilFormat& sFmt =
+                this->glCaps().stencilFormats()[sIdx];
+        CLEAR_ERROR_BEFORE_ALLOC(this->glInterface());
         // we do this "if" so that we don't call the multisample
         // version on a GL that doesn't have an MSAA extension.
         if (samples > 1) {
-            GR_GL_CALL_NOERRCHECK(this->glInterface(),
-                                  RenderbufferStorageMultisample(
-                                        GR_GL_RENDERBUFFER,
-                                        samples,
-                                        sFmt.fInternalFormat,
-                                        width,
-                                        height));
+            GL_ALLOC_CALL(this->glInterface(),
+                          RenderbufferStorageMultisample(GR_GL_RENDERBUFFER,
+                                                         samples,
+                                                         sFmt.fInternalFormat,
+                                                         width, height));
         } else {
-            GR_GL_CALL_NOERRCHECK(this->glInterface(),
-                                  RenderbufferStorage(GR_GL_RENDERBUFFER,
-                                                      sFmt.fInternalFormat,
-                                                      width, height));
+            GL_ALLOC_CALL(this->glInterface(),
+                          RenderbufferStorage(GR_GL_RENDERBUFFER,
+                                              sFmt.fInternalFormat,
+                                              width, height));
         }
 
-        GrGLenum err = GR_GL_GET_ERROR(this->glInterface());
+        GrGLenum err = CHECK_ALLOC_ERROR(this->glInterface());
         if (err == GR_GL_NO_ERROR) {
             // After sized formats we attempt an unsized format and take whatever
             // sizes GL gives us. In that case we query for the size.
@@ -1277,20 +1134,26 @@
         }
 
         GrGLenum status;
-        GL_CALL_RET(status, CheckFramebufferStatus(GR_GL_FRAMEBUFFER));
-        if (status != GR_GL_FRAMEBUFFER_COMPLETE) {
-            GL_CALL(FramebufferRenderbuffer(GR_GL_FRAMEBUFFER,
-                                          GR_GL_STENCIL_ATTACHMENT,
-                                          GR_GL_RENDERBUFFER, 0));
-            if (glsb->format().fPacked) {
+        if (!this->glCaps().isColorConfigAndStencilFormatVerified(rt->config(),
+                                                           glsb->format())) {
+            GL_CALL_RET(status, CheckFramebufferStatus(GR_GL_FRAMEBUFFER));
+            if (status != GR_GL_FRAMEBUFFER_COMPLETE) {
                 GL_CALL(FramebufferRenderbuffer(GR_GL_FRAMEBUFFER,
-                                              GR_GL_DEPTH_ATTACHMENT,
+                                              GR_GL_STENCIL_ATTACHMENT,
                                               GR_GL_RENDERBUFFER, 0));
+                if (glsb->format().fPacked) {
+                    GL_CALL(FramebufferRenderbuffer(GR_GL_FRAMEBUFFER,
+                                                  GR_GL_DEPTH_ATTACHMENT,
+                                                  GR_GL_RENDERBUFFER, 0));
+                }
+                return false;
+            } else {
+                fGLContextInfo.caps().markColorConfigAndStencilFormatAsVerified(
+                    rt->config(),
+                    glsb->format());
             }
-            return false;
-        } else {
-            return true;
         }
+        return true;
     }
 }
 
@@ -1302,12 +1165,15 @@
     if (id) {
         GL_CALL(BindBuffer(GR_GL_ARRAY_BUFFER, id));
         fHWGeometryState.fArrayPtrsDirty = true;
-        GrGLClearErr(this->glInterface());
+        CLEAR_ERROR_BEFORE_ALLOC(this->glInterface());
         // make sure driver can allocate memory for this buffer
-        GR_GL_CALL_NOERRCHECK(this->glInterface(),
-                              BufferData(GR_GL_ARRAY_BUFFER, size, NULL, 
-                              dynamic ? GR_GL_DYNAMIC_DRAW : GR_GL_STATIC_DRAW));
-        if (GR_GL_GET_ERROR(this->glInterface()) != GR_GL_NO_ERROR) {
+        GL_ALLOC_CALL(this->glInterface(),
+                      BufferData(GR_GL_ARRAY_BUFFER,
+                                 size,
+                                 NULL,   // data ptr
+                                 dynamic ? GR_GL_DYNAMIC_DRAW :
+                                           GR_GL_STATIC_DRAW));
+        if (CHECK_ALLOC_ERROR(this->glInterface()) != GR_GL_NO_ERROR) {
             GL_CALL(DeleteBuffers(1, &id));
             // deleting bound buffer does implicit bind to 0
             fHWGeometryState.fVertexBuffer = NULL;
@@ -1326,12 +1192,15 @@
     GL_CALL(GenBuffers(1, &id));
     if (id) {
         GL_CALL(BindBuffer(GR_GL_ELEMENT_ARRAY_BUFFER, id));
-        GrGLClearErr(this->glInterface());
+        CLEAR_ERROR_BEFORE_ALLOC(this->glInterface());
         // make sure driver can allocate memory for this buffer
-        GR_GL_CALL_NOERRCHECK(this->glInterface(),
-                              BufferData(GR_GL_ELEMENT_ARRAY_BUFFER, size, NULL,
-                              dynamic ? GR_GL_DYNAMIC_DRAW : GR_GL_STATIC_DRAW));
-        if (GR_GL_GET_ERROR(this->glInterface()) != GR_GL_NO_ERROR) {
+        GL_ALLOC_CALL(this->glInterface(),
+                      BufferData(GR_GL_ELEMENT_ARRAY_BUFFER,
+                                 size,
+                                 NULL,  // data ptr
+                                 dynamic ? GR_GL_DYNAMIC_DRAW :
+                                           GR_GL_STATIC_DRAW));
+        if (CHECK_ALLOC_ERROR(this->glInterface()) != GR_GL_NO_ERROR) {
             GL_CALL(DeleteBuffers(1, &id));
             // deleting bound buffer does implicit bind to 0
             fHWGeometryState.fIndexBuffer = NULL;
@@ -1477,13 +1346,13 @@
                                         GrPixelConfig config,
                                         size_t rowBytes) const {
     // if GL can do the flip then we'll never pay for it.
-    if (this->glCaps().fPackFlipYSupport) {
+    if (this->glCaps().packFlipYSupport()) {
         return false;
     }
 
     // If we have to do memcpy to handle non-trim rowBytes then we
     // get the flip for free. Otherwise it costs.
-    if (this->glCaps().fPackRowLengthSupport) {
+    if (this->glCaps().packRowLengthSupport()) {
         return true;
     }
     // If we have to do memcpys to handle rowBytes then y-flip is free
@@ -1528,7 +1397,7 @@
             this->flushRenderTarget(&GrIRect::EmptyIRect());
             break;
         case GrGLRenderTarget::kCanResolve_ResolveType:
-            this->resolveRenderTarget(tgt);
+            this->onResolveRenderTarget(tgt);
             // we don't track the state of the READ FBO ID.
             GL_CALL(BindFramebuffer(GR_GL_READ_FRAMEBUFFER,
                                     tgt->textureFBOID()));
@@ -1554,7 +1423,7 @@
     // a scratch buffer.
     SkAutoSMalloc<32 * sizeof(GrColor)> scratch;
     if (rowBytes != tightRowBytes) {
-        if (this->glCaps().fPackRowLengthSupport) {
+        if (this->glCaps().packRowLengthSupport()) {
             GrAssert(!(rowBytes % sizeof(GrColor)));
             GL_CALL(PixelStorei(GR_GL_PACK_ROW_LENGTH, rowBytes / sizeof(GrColor)));
             readDstRowBytes = rowBytes;
@@ -1563,17 +1432,17 @@
             readDst = scratch.get();
         }
     }
-    if (!invertY && this->glCaps().fPackFlipYSupport) {
+    if (!invertY && this->glCaps().packFlipYSupport()) {
         GL_CALL(PixelStorei(GR_GL_PACK_REVERSE_ROW_ORDER, 1));
     }
     GL_CALL(ReadPixels(readRect.fLeft, readRect.fBottom,
                        readRect.fWidth, readRect.fHeight,
                        format, type, readDst));
     if (readDstRowBytes != tightRowBytes) {
-        GrAssert(this->glCaps().fPackRowLengthSupport);
+        GrAssert(this->glCaps().packRowLengthSupport());
         GL_CALL(PixelStorei(GR_GL_PACK_ROW_LENGTH, 0));
     }
-    if (!invertY && this->glCaps().fPackFlipYSupport) {
+    if (!invertY && this->glCaps().packFlipYSupport()) {
         GL_CALL(PixelStorei(GR_GL_PACK_REVERSE_ROW_ORDER, 0));
         invertY = true;
     }
@@ -1742,10 +1611,12 @@
 #endif
 }
 
-void GrGpuGL::resolveRenderTarget(GrGLRenderTarget* rt) {
+void GrGpuGL::onResolveRenderTarget(GrRenderTarget* target) {
+
+    GrGLRenderTarget* rt = static_cast<GrGLRenderTarget*>(target);
 
     if (rt->needsResolve()) {
-        GrAssert(GLCaps::kNone_MSFBO != fGLCaps.fMSFBOType);
+        GrAssert(GrGLCaps::kNone_MSFBOType != this->glCaps().msFBOType());
         GrAssert(rt->textureFBOID() != rt->renderFBOID());
         GL_CALL(BindFramebuffer(GR_GL_READ_FRAMEBUFFER,
                                 rt->renderFBOID()));
@@ -1763,7 +1634,7 @@
         r.setRelativeTo(vp, dirtyRect.fLeft, dirtyRect.fTop, 
                         dirtyRect.width(), dirtyRect.height());
 
-        if (GLCaps::kAppleES_MSFBO == fGLCaps.fMSFBOType) {
+        if (GrGLCaps::kAppleES_MSFBOType == this->glCaps().msFBOType()) {
             // Apple's extension uses the scissor as the blit bounds.
             GL_CALL(Enable(GR_GL_SCISSOR_TEST));
             GL_CALL(Scissor(r.fLeft, r.fBottom,
@@ -1772,9 +1643,10 @@
             fHWBounds.fScissorRect.invalidate();
             fHWBounds.fScissorEnabled = true;
         } else {
-            if (GLCaps::kDesktopARB_MSFBO != fGLCaps.fMSFBOType) {
+            if (GrGLCaps::kDesktopARB_MSFBOType != this->glCaps().msFBOType()) {
                 // this respects the scissor during the blit, so disable it.
-                GrAssert(GLCaps::kDesktopEXT_MSFBO == fGLCaps.fMSFBOType);
+                GrAssert(GrGLCaps::kDesktopEXT_MSFBOType ==
+                         this->glCaps().msFBOType());
                 this->flushScissor(NULL);
             }
             int right = r.fLeft + r.fWidth;
@@ -1850,7 +1722,7 @@
 
         if (settings->isDisabled()) {
             if (stencilClip) {
-                settings = &gClipStencilSettings;
+                settings = GetClipStencilSettings();
             }
         }
 
@@ -2063,6 +1935,8 @@
             return GR_GL_LINEAR;
         case GrSamplerState::kNearest_Filter:
         case GrSamplerState::kConvolution_Filter:
+        case GrSamplerState::kErode_Filter:
+        case GrSamplerState::kDilate_Filter:
             return GR_GL_NEAREST;
         default:
             GrAssert(!"Unknown filter type");
@@ -2126,7 +2000,7 @@
             GrGLRenderTarget* texRT = 
                 static_cast<GrGLRenderTarget*>(nextTexture->asRenderTarget());
             if (NULL != texRT) {
-                resolveRenderTarget(texRT);
+                this->onResolveRenderTarget(texRT);
             }
 
             if (fHWDrawState.getTexture(s) != nextTexture) {
@@ -2178,7 +2052,7 @@
                                         GR_GL_TEXTURE_WRAP_T,
                                         newTexParams.fWrapT));
             }
-            if (this->glCaps().fTextureSwizzleSupport &&
+            if (this->glCaps().textureSwizzleSupport() &&
                 (setAll ||
                  memcmp(newTexParams.fSwizzleRGBA,
                         oldTexParams.fSwizzleRGBA,
@@ -2340,10 +2214,10 @@
             break;
         case kBGRA_8888_PM_GrPixelConfig:
         case kBGRA_8888_UPM_GrPixelConfig:
-            if (!fGLCaps.fBGRAFormatSupport) {
+            if (!this->glCaps().bgraFormatSupport()) {
                 return false;
             }
-            if (fGLCaps.fBGRAIsInternalFormat) {
+            if (this->glCaps().bgraIsInternalFormat()) {
                 if (getSizedInternalFormat) {
                     *internalFormat = GR_GL_BGRA8;
                 } else {
@@ -2496,45 +2370,7 @@
 int GrGpuGL::getMaxEdges() const {
     // FIXME:  This is a pessimistic estimate based on how many other things
     // want to add uniforms.  This should be centralized somewhere.
-    return GR_CT_MIN(fGLCaps.fMaxFragmentUniformVectors - 8,
+    return GR_CT_MIN(this->glCaps().maxFragmentUniformVectors() - 8,
                      GrDrawState::kMaxEdges);
 }
 
-void GrGpuGL::GLCaps::print() const {
-    for (int i = 0; i < fStencilFormats.count(); ++i) {
-        GrPrintf("Stencil Format %d, stencil bits: %02d, total bits: %02d\n",
-                 i,
-                 fStencilFormats[i].fStencilBits,
-                 fStencilFormats[i].fTotalBits);
-    }
-
-    GR_STATIC_ASSERT(0 == kNone_MSFBO);
-    GR_STATIC_ASSERT(1 == kDesktopARB_MSFBO);
-    GR_STATIC_ASSERT(2 == kDesktopEXT_MSFBO);
-    GR_STATIC_ASSERT(3 == kAppleES_MSFBO);
-    static const char* gMSFBOExtStr[] = {
-        "None",
-        "ARB",
-        "EXT",
-        "Apple",
-    };
-    GrPrintf("MSAA Type: %s\n", gMSFBOExtStr[fMSFBOType]);
-    for (int i = 0; i < (int)GR_ARRAY_COUNT(fAASamples); ++i) {
-        GrPrintf("AA Level %d has %d samples\n", i, fAASamples[i]);
-    }
-    GrPrintf("Max FS Uniform Vectors: %d\n", fMaxFragmentUniformVectors);
-    GrPrintf("Support RGBA8 Render Buffer: %s\n",
-             (fRGBA8RenderbufferSupport ? "YES": "NO"));
-    GrPrintf("BGRA is an internal format: %s\n",
-             (fBGRAIsInternalFormat ? "YES": "NO"));
-    GrPrintf("Support texture swizzle: %s\n",
-             (fTextureSwizzleSupport ? "YES": "NO"));
-    GrPrintf("Unpack Row length support: %s\n",
-             (fUnpackRowLengthSupport ? "YES": "NO"));
-    GrPrintf("Unpack Flip Y support: %s\n",
-             (fUnpackFlipYSupport ? "YES": "NO"));
-    GrPrintf("Pack Row length support: %s\n",
-             (fPackRowLengthSupport ? "YES": "NO"));
-    GrPrintf("Pack Flip Y support: %s\n",
-             (fPackFlipYSupport ? "YES": "NO"));
-}
diff --git a/src/gpu/GrGpuGL.h b/src/gpu/gl/GrGpuGL.h
similarity index 71%
rename from src/gpu/GrGpuGL.h
rename to src/gpu/gl/GrGpuGL.h
index 36fead1..398a2fc 100644
--- a/src/gpu/GrGpuGL.h
+++ b/src/gpu/gl/GrGpuGL.h
@@ -11,23 +11,27 @@
 #ifndef GrGpuGL_DEFINED
 #define GrGpuGL_DEFINED
 
-#include "GrDrawState.h"
-#include "GrGpu.h"
+#include "../GrDrawState.h"
+#include "../GrGpu.h"
+#include "GrGLContextInfo.h"
 #include "GrGLIndexBuffer.h"
 #include "GrGLIRect.h"
 #include "GrGLStencilBuffer.h"
 #include "GrGLTexture.h"
 #include "GrGLVertexBuffer.h"
 
-#include "SkString.h"
-
 class GrGpuGL : public GrGpu {
 public:
     virtual ~GrGpuGL();
 
-    const GrGLInterface* glInterface() const { return fGL; }
-    GrGLBinding glBinding() const { return fGLBinding; }
-    GrGLVersion glVersion() const { return fGLVersion; }
+    const GrGLInterface* glInterface() const { 
+        return fGLContextInfo.interface();
+    }
+    GrGLBinding glBinding() const { return fGLContextInfo.binding(); }
+    GrGLVersion glVersion() const { return fGLContextInfo.version(); }
+    GrGLSLGeneration glslGeneration() const {
+        return fGLContextInfo.glslGeneration();
+    }
 
     // GrGpu overrides
     virtual GrPixelConfig preferredReadPixelsConfig(GrPixelConfig config)
@@ -41,89 +45,12 @@
                                     GrPixelConfig config,
                                     size_t rowBytes) const SK_OVERRIDE;
     virtual bool fullReadPixelsIsFasterThanPartial() const SK_OVERRIDE;
+
+    virtual bool canPreserveReadWriteUnpremulPixels() SK_OVERRIDE;
+
 protected:
-    GrGpuGL(const GrGLInterface* glInterface, GrGLBinding glBinding);
+    GrGpuGL(const GrGLContextInfo& ctxInfo);
 
-    struct GLCaps {
-        GLCaps()
-            // make defaults be the most restrictive
-            : fStencilFormats(8) // prealloc space for    stencil formats
-            , fMSFBOType(kNone_MSFBO)
-            , fMaxFragmentUniformVectors(0)
-            , fRGBA8RenderbufferSupport(false)
-            , fBGRAFormatSupport(false)
-            , fBGRAIsInternalFormat(false)
-            , fTextureSwizzleSupport(false)
-            , fUnpackRowLengthSupport(false)
-            , fUnpackFlipYSupport(false)
-            , fPackRowLengthSupport(false)
-            , fPackFlipYSupport(false)
-            , fTextureUsageSupport(false)
-            , fTexStorageSupport(false) {
-            memset(fAASamples, 0, sizeof(fAASamples));
-        }
-        SkTArray<GrGLStencilBuffer::Format, true> fStencilFormats;
-
-        enum {
-            /**
-             * no support for MSAA FBOs
-             */
-            kNone_MSFBO = 0,  
-            /**
-             * GL3.0-style MSAA FBO (GL_ARB_framebuffer_object)
-             */
-            kDesktopARB_MSFBO,
-            /**
-             * earlier GL_EXT_framebuffer* extensions
-             */
-            kDesktopEXT_MSFBO,
-            /**
-             * GL_APPLE_framebuffer_multisample ES extension
-             */
-            kAppleES_MSFBO,
-        } fMSFBOType;
-
-        // TODO: get rid of GrAALevel and use sample cnt directly
-        GrGLuint fAASamples[4];
-
-        // The maximum number of fragment uniform vectors (GLES has min. 16).
-        int fMaxFragmentUniformVectors;
-
-        // ES requires an extension to support RGBA8 in RenderBufferStorage
-        bool fRGBA8RenderbufferSupport;
-
-        // Is GL_BGRA supported
-        bool fBGRAFormatSupport;
-
-        // Depending on the ES extensions present the BGRA external format may
-        // correspond either a BGRA or RGBA internalFormat. On desktop GL it is
-        // RGBA
-        bool fBGRAIsInternalFormat;
-
-        // GL_ARB_texture_swizzle support
-        bool fTextureSwizzleSupport;
-    
-        // Is there support for GL_UNPACK_ROW_LENGTH
-        bool fUnpackRowLengthSupport;
-
-        // Is there support for GL_UNPACK_FLIP_Y
-        bool fUnpackFlipYSupport;
-
-        // Is there support for GL_PACK_ROW_LENGTH
-        bool fPackRowLengthSupport;
-
-        // Is there support for GL_PACK_REVERSE_ROW_ORDER
-        bool fPackFlipYSupport;
-
-        // Is there support for texture parameter GL_TEXTURE_USAGE
-        bool fTextureUsageSupport;
-
-        // Is there support for glTexStorage
-        bool fTexStorageSupport;
-
-        void print() const;
-    } fGLCaps;
- 
     struct {
         size_t                  fVertexOffset;
         GrVertexLayout          fVertexLayout;
@@ -137,6 +64,11 @@
         bool fSmoothLineEnabled;
     } fHWAAState;
 
+    enum UnpremulConversion {
+        kUpOnWrite_DownOnRead_UnpremulConversion,
+        kDownOnWrite_UpOnRead_UnpremulConversion
+    } fUnpremulConversion;
+
     GrDrawState fHWDrawState;
     bool        fHWStencilClip;
 
@@ -161,7 +93,7 @@
         GrGLIRect   fViewportRect;
     } fHWBounds;
 
-    const GLCaps& glCaps() const { return fGLCaps; }
+    const GrGLCaps& glCaps() const { return fGLContextInfo.caps(); }
 
     // GrGpu overrides
     virtual void onResetContext() SK_OVERRIDE;
@@ -173,7 +105,6 @@
                                                  bool dynamic);
     virtual GrIndexBuffer* onCreateIndexBuffer(uint32_t size,
                                                bool dynamic);
-    virtual GrResource* onCreatePlatformSurface(const GrPlatformSurfaceDesc& desc);
     virtual GrTexture* onCreatePlatformTexture(const GrPlatformTextureDesc& desc) SK_OVERRIDE;
     virtual GrRenderTarget* onCreatePlatformRenderTarget(const GrPlatformRenderTargetDesc& desc) SK_OVERRIDE;
     virtual bool createStencilBufferForRenderTarget(GrRenderTarget* rt,
@@ -198,6 +129,9 @@
                                       GrPixelConfig config, const void* buffer,
                                       size_t rowBytes) SK_OVERRIDE;
 
+    virtual void onResolveRenderTarget(GrRenderTarget* target) SK_OVERRIDE;
+
+
     virtual void onGpuDrawIndexed(GrPrimitiveType type,
                                   uint32_t startVertex,
                                   uint32_t startIndex,
@@ -237,10 +171,12 @@
                     GrBlendCoeff srcCoeff,
                     GrBlendCoeff dstCoeff);
 
-    bool hasExtension(const char* ext) {
-        return GrGLHasExtensionFromString(ext, fExtensionString.c_str());
+    bool hasExtension(const char* ext) const {
+        return fGLContextInfo.hasExtension(ext);
     }
 
+    const GrGLContextInfo& glContextInfo() const { return fGLContextInfo; }
+
     // adjusts texture matrix to account for orientation
     static void AdjustTextureMatrix(const GrGLTexture* texture,
                                     GrSamplerState::SampleMode mode,
@@ -255,8 +191,7 @@
     static bool BlendCoeffReferencesConstant(GrBlendCoeff coeff);
 
 private:
-    // Inits GrDrawTarget::Caps and GLCaps, sublcass may enable
-    // additional caps.
+    // Inits GrDrawTarget::Caps, subclass may enable additional caps.
     void initCaps();
 
     void initFSAASupport();
@@ -281,8 +216,6 @@
     void flushStencil();
     void flushAAState(GrPrimitiveType type);
 
-    void resolveRenderTarget(GrGLRenderTarget* texture);
-
     bool configToGLFormats(GrPixelConfig config,
                            bool getSizedInternal,
                            GrGLenum* internalFormat,
@@ -305,9 +238,7 @@
     friend class GrGLTexture;
     friend class GrGLRenderTarget;
 
-    // read these once at begining and then never again
-    SkString fExtensionString;
-    GrGLVersion fGLVersion;
+    GrGLContextInfo fGLContextInfo;
 
     // we want to clear stencil buffers when they are created. We want to clear
     // the entire buffer even if it is larger than the color attachment. We
@@ -322,8 +253,11 @@
     // from our loop that tries stencil formats and calls check fb status.
     int fLastSuccessfulStencilFmtIdx;
 
-    const GrGLInterface* fGL;
-    GrGLBinding fGLBinding;
+    enum CanPreserveUnpremulRoundtrip {
+        kUnknown_CanPreserveUnpremulRoundtrip,
+        kNo_CanPreserveUnpremulRoundtrip,
+        kYes_CanPreserveUnpremulRoundtrip,
+    } fCanPreserveUnpremulRoundtrip;
 
     bool fPrintedCaps;
 
diff --git a/src/gpu/GrGpuGLShaders.cpp b/src/gpu/gl/GrGpuGLShaders.cpp
similarity index 84%
rename from src/gpu/GrGpuGLShaders.cpp
rename to src/gpu/gl/GrGpuGLShaders.cpp
index c203168..db7e3a7 100644
--- a/src/gpu/GrGpuGLShaders.cpp
+++ b/src/gpu/gl/GrGpuGLShaders.cpp
@@ -7,19 +7,19 @@
  */
 
 
-#include "GrBinHashKey.h"
+#include "../GrBinHashKey.h"
 #include "GrGLProgram.h"
 #include "GrGLSL.h"
 #include "GrGpuGLShaders.h"
-#include "GrGpuVertex.h"
+#include "../GrGpuVertex.h"
 #include "GrNoncopyable.h"
-#include "GrStringBuilder.h"
-#include "GrRandom.h"
+#include "../GrStringBuilder.h"
+#include "../GrRandom.h"
 
 #define SKIP_CACHE_CHECK    true
 #define GR_UINT32_MAX   static_cast<uint32_t>(-1)
 
-#include "GrTHashCache.h"
+#include "../GrTHashCache.h"
 
 class GrGpuGLShaders::ProgramCache : public ::GrNoncopyable {
 private:
@@ -55,21 +55,19 @@
     Entry                       fEntries[kMaxEntries];
     int                         fCount;
     unsigned int                fCurrLRUStamp;
-    const GrGLInterface*        fGL;
-    GrGLSLGeneration            fGLSLGeneration;
+    const GrGLContextInfo&      fGL;
 
 public:
-    ProgramCache(const GrGLInterface* gl,
-                 GrGLSLGeneration glslGeneration) 
+    ProgramCache(const GrGLContextInfo& gl)
         : fCount(0)
         , fCurrLRUStamp(0)
-        , fGL(gl)
-        , fGLSLGeneration(glslGeneration) {
+        , fGL(gl) {
     }
 
     ~ProgramCache() {
         for (int i = 0; i < fCount; ++i) {
-            GrGpuGLShaders::DeleteProgram(fGL, &fEntries[i].fProgramData);
+            GrGpuGLShaders::DeleteProgram(fGL.interface(),
+                                          &fEntries[i].fProgramData);
         }
     }
 
@@ -90,8 +88,7 @@
         
         Entry* entry = fHashCache.find(newEntry.fKey);
         if (NULL == entry) {
-            if (!desc.genProgram(fGL, fGLSLGeneration,
-                                 &newEntry.fProgramData)) {
+            if (!desc.genProgram(fGL, &newEntry.fProgramData)) {
                 return NULL;
             }
             if (fCount < kMaxEntries) {
@@ -106,7 +103,8 @@
                     }
                 }
                 fHashCache.remove(entry->fKey, entry);
-                GrGpuGLShaders::DeleteProgram(fGL, &entry->fProgramData);
+                GrGpuGLShaders::DeleteProgram(fGL.interface(),
+                                              &entry->fProgramData);
             }
             entry->copyAndTakeOwnership(newEntry);
             fHashCache.insert(entry->fKey, entry);
@@ -166,7 +164,7 @@
 bool GrGpuGLShaders::programUnitTest() {
 
     GrGLSLGeneration glslGeneration = 
-            GetGLSLGeneration(this->glBinding(), this->glInterface());
+            GrGetGLSLGeneration(this->glBinding(), this->glInterface());
     static const int STAGE_OPTS[] = {
         0,
         StageDesc::kNoPerspective_OptFlagBit,
@@ -175,8 +173,9 @@
     static const int IN_CONFIG_FLAGS[] = {
         StageDesc::kNone_InConfigFlag,
         StageDesc::kSwapRAndB_InConfigFlag,
-        StageDesc::kSwapRAndB_InConfigFlag | StageDesc::kMulRGBByAlpha_InConfigFlag,
-        StageDesc::kMulRGBByAlpha_InConfigFlag,
+        StageDesc::kSwapRAndB_InConfigFlag |
+        StageDesc::kMulRGBByAlpha_RoundUp_InConfigFlag,
+        StageDesc::kMulRGBByAlpha_RoundDown_InConfigFlag,
         StageDesc::kSmearAlpha_InConfigFlag,
     };
     GrGLProgram program;
@@ -198,6 +197,7 @@
         pdesc.fVertexLayout = 0;
         pdesc.fEmitsPointSize = random.nextF() > .5f;
         pdesc.fColorInput = random_int(&random, ProgramDesc::kColorInputCnt);
+        pdesc.fCoverageInput = random_int(&random, ProgramDesc::kColorInputCnt);
 
         pdesc.fColorFilterXfermode = random_int(&random, SkXfermode::kCoeffModesCnt);
 
@@ -211,7 +211,7 @@
         pdesc.fExperimentalGS = this->getCaps().fGeometryShaderSupport &&
                                 random_bool(&random);
 #endif
-        pdesc.fOutputPM =  random_int(&random, ProgramDesc::kOutputPMCnt);
+        pdesc.fOutputConfig =  random_int(&random, ProgramDesc::kOutputConfigCnt);
 
         bool edgeAA = random_bool(&random);
         if (edgeAA) {
@@ -219,9 +219,7 @@
             if (vertexEdgeAA) {
                 pdesc.fVertexLayout |= GrDrawTarget::kEdge_VertexLayoutBit;
                 if (this->getCaps().fShaderDerivativeSupport) {
-                    pdesc.fVertexEdgeType = random_bool(&random) ?
-                        GrDrawState::kHairQuad_EdgeType :
-                        GrDrawState::kHairLine_EdgeType;
+                    pdesc.fVertexEdgeType = (GrDrawState::VertexEdgeType) random_int(&random, GrDrawState::kVertexEdgeTypeCnt);
                 } else {
                     pdesc.fVertexEdgeType = GrDrawState::kHairLine_EdgeType;
                 }
@@ -263,28 +261,33 @@
             stage.fCoordMapping =  random_int(&random, StageDesc::kCoordMappingCnt);
             stage.fFetchMode = random_int(&random, StageDesc::kFetchModeCnt);
             // convolution shaders don't work with persp tex matrix
-            if (stage.fFetchMode == StageDesc::kConvolution_FetchMode) {
+            if (stage.fFetchMode == StageDesc::kConvolution_FetchMode ||
+                stage.fFetchMode == StageDesc::kDilate_FetchMode ||
+                stage.fFetchMode == StageDesc::kErode_FetchMode) {
                 stage.fOptFlags |= StageDesc::kNoPerspective_OptFlagBit;
             }
             stage.setEnabled(VertexUsesStage(s, pdesc.fVertexLayout));
+            static const uint32_t kMulByAlphaMask =
+                StageDesc::kMulRGBByAlpha_RoundUp_InConfigFlag |
+                StageDesc::kMulRGBByAlpha_RoundDown_InConfigFlag;
             switch (stage.fFetchMode) {
                 case StageDesc::kSingle_FetchMode:
                     stage.fKernelWidth = 0;
                     break;
                 case StageDesc::kConvolution_FetchMode:
+                case StageDesc::kDilate_FetchMode:
+                case StageDesc::kErode_FetchMode:
                     stage.fKernelWidth = random_int(&random, 2, 8);
-                    stage.fInConfigFlags &= ~StageDesc::kMulRGBByAlpha_InConfigFlag;
+                    stage.fInConfigFlags &= ~kMulByAlphaMask;
                     break;
                 case StageDesc::k2x2_FetchMode:
                     stage.fKernelWidth = 0;
-                    stage.fInConfigFlags &= ~StageDesc::kMulRGBByAlpha_InConfigFlag;
+                    stage.fInConfigFlags &= ~kMulByAlphaMask;
                     break;
             }
         }
         CachedData cachedData;
-        if (!program.genProgram(this->glInterface(),
-                                glslGeneration,
-                                &cachedData)) {
+        if (!program.genProgram(this->glContextInfo(), &cachedData)) {
             return false;
         }
         DeleteProgram(this->glInterface(), &cachedData);
@@ -292,25 +295,10 @@
     return true;
 }
 
-namespace {
-GrGLBinding get_binding_in_use(const GrGLInterface* gl) {
-    if (gl->supportsDesktop()) {
-        return kDesktop_GrGLBinding;
-    } else {
-        GrAssert(gl->supportsES2());
-        return kES2_GrGLBinding;
-    }
-}
-}
-
-GrGpuGLShaders::GrGpuGLShaders(const GrGLInterface* gl)
-    : GrGpuGL(gl, get_binding_in_use(gl)) {
-
-    GrGLSLGeneration glslGeneration =
-        GetGLSLGeneration(this->glBinding(), gl);
+GrGpuGLShaders::GrGpuGLShaders(const GrGLContextInfo& ctxInfo)
+    : GrGpuGL(ctxInfo) {
 
     // Enable supported shader-related caps
-    fCaps.fSupportPerVertexCoverage = true;
     if (kDesktop_GrGLBinding == this->glBinding()) {
         fCaps.fDualSourceBlendingSupport =
                             this->glVersion() >= GR_GL_VER(3,3) ||
@@ -319,16 +307,18 @@
         // we don't support GL_ARB_geometry_shader4, just GL 3.2+ GS
         fCaps.fGeometryShaderSupport = 
                                 this->glVersion() >= GR_GL_VER(3,2) &&
-                                glslGeneration >= k150_GLSLGeneration;
+                                this->glslGeneration() >= k150_GrGLSLGeneration;
     } else {
         fCaps.fShaderDerivativeSupport =
                             this->hasExtension("GL_OES_standard_derivatives");
     }
 
-    GR_GL_GetIntegerv(gl, GR_GL_MAX_VERTEX_ATTRIBS, &fMaxVertexAttribs);
+    GR_GL_GetIntegerv(this->glInterface(),
+                      GR_GL_MAX_VERTEX_ATTRIBS,
+                      &fMaxVertexAttribs);
 
     fProgramData = NULL;
-    fProgramCache = new ProgramCache(gl, glslGeneration);
+    fProgramCache = new ProgramCache(this->glContextInfo());
 
 #if 0
     this->programUnitTest();
@@ -574,7 +564,20 @@
     }
     int imageIncrementUni = fProgramData->fUniLocations.fStages[s].fImageIncrementUni;
     if (GrGLProgram::kUnusedUniform != imageIncrementUni) {
-        GL_CALL(Uniform2fv(imageIncrementUni, 1, sampler.getImageIncrement()));
+        const GrGLTexture* texture =
+            static_cast<const GrGLTexture*>(this->getDrawState().getTexture(s));
+        float imageIncrement[2] = { 0 };
+        switch (sampler.getFilterDirection()) {
+            case GrSamplerState::kX_FilterDirection:
+                imageIncrement[0] = 1.0f / texture->width();
+                break;
+            case GrSamplerState::kY_FilterDirection:
+                imageIncrement[1] = 1.0f / texture->height();
+                break;
+            default:
+                GrCrash("Unknown filter direction.");
+        }
+        GL_CALL(Uniform2fv(imageIncrementUni, 1, imageIncrement));
     }
 }
 
@@ -656,7 +659,8 @@
         switch (desc.fColorInput) {
             case ProgramDesc::kAttribute_ColorInput:
                 if (fHWDrawState.getColor() != color) {
-                    // OpenGL ES only supports the float varities of glVertexAttrib
+                    // OpenGL ES only supports the float varieties of
+                    // glVertexAttrib
                     float c[] = GR_COLOR_TO_VEC4(color);
                     GL_CALL(VertexAttrib4fv(GrGLProgram::ColorAttributeIdx(), 
                                             c));
@@ -665,7 +669,8 @@
                 break;
             case ProgramDesc::kUniform_ColorInput:
                 if (fProgramData->fColor != color) {
-                    // OpenGL ES only supports the float varities of glVertexAttrib
+                    // OpenGL ES doesn't support unsigned byte varieties of
+                    // glUniform
                     float c[] = GR_COLOR_TO_VEC4(color);
                     GrAssert(GrGLProgram::kUnusedUniform != 
                              fProgramData->fUniLocations.fColorUni);
@@ -691,6 +696,47 @@
     }
 }
 
+void GrGpuGLShaders::flushCoverage(GrColor coverage) {
+    const ProgramDesc& desc = fCurrentProgram.getDesc();
+    const GrDrawState& drawState = this->getDrawState();
+
+
+    if (this->getGeomSrc().fVertexLayout & kCoverage_VertexLayoutBit) {
+        // coverage will be specified per-vertex as an attribute
+        // invalidate the const vertex attrib coverage
+        fHWDrawState.setCoverage4(GrColor_ILLEGAL);
+    } else {
+        switch (desc.fCoverageInput) {
+            case ProgramDesc::kAttribute_ColorInput:
+                if (fHWDrawState.getCoverage() != coverage) {
+                    // OpenGL ES only supports the float varieties of
+                    // glVertexAttrib
+                    float c[] = GR_COLOR_TO_VEC4(coverage);
+                    GL_CALL(VertexAttrib4fv(GrGLProgram::CoverageAttributeIdx(), 
+                                            c));
+                    fHWDrawState.setCoverage(coverage);
+                }
+                break;
+            case ProgramDesc::kUniform_ColorInput:
+                if (fProgramData->fCoverage != coverage) {
+                    // OpenGL ES doesn't support unsigned byte varieties of
+                    // glUniform
+                    float c[] = GR_COLOR_TO_VEC4(coverage);
+                    GrAssert(GrGLProgram::kUnusedUniform != 
+                             fProgramData->fUniLocations.fCoverageUni);
+                    GL_CALL(Uniform4fv(fProgramData->fUniLocations.fCoverageUni,
+                                        1, c));
+                    fProgramData->fCoverage = coverage;
+                }
+                break;
+            case ProgramDesc::kSolidWhite_ColorInput:
+            case ProgramDesc::kTransBlack_ColorInput:
+                break;
+            default:
+                GrCrash("Unknown coverage type.");
+        }
+    }
+}
 
 bool GrGpuGLShaders::flushGraphicsState(GrPrimitiveType type) {
     if (!flushGLStateCommon(type)) {
@@ -729,14 +775,19 @@
     this->flushBlend(type, srcCoeff, dstCoeff);
 
     GrColor color;
+    GrColor coverage;
     if (blendOpts & kEmitTransBlack_BlendOptFlag) {
         color = 0;
+        coverage = 0;
     } else if (blendOpts & kEmitCoverage_BlendOptFlag) {
         color = 0xffffffff;
+        coverage = drawState.getCoverage();
     } else {
         color = drawState.getColor();
+        coverage = drawState.getCoverage();
     }
     this->flushColor(color);
+    this->flushCoverage(coverage);
 
     this->flushViewMatrix();
 
@@ -865,15 +916,14 @@
     }
 
     if (newCoverageOffset > 0) {
-        // bind a single channel, they should all have the same value.
         GrGLvoid* coverageOffset = (int8_t*)(vertexOffset + newCoverageOffset);
         int idx = GrGLProgram::CoverageAttributeIdx();
         if (oldCoverageOffset <= 0) {
             GL_CALL(EnableVertexAttribArray(idx));
-            GL_CALL(VertexAttribPointer(idx, 1, GR_GL_UNSIGNED_BYTE,
+            GL_CALL(VertexAttribPointer(idx, 4, GR_GL_UNSIGNED_BYTE,
                                         true, newStride, coverageOffset));
         } else if (allOffsetsChange || newCoverageOffset != oldCoverageOffset) {
-            GL_CALL(VertexAttribPointer(idx, 1, GR_GL_UNSIGNED_BYTE,
+            GL_CALL(VertexAttribPointer(idx, 4, GR_GL_UNSIGNED_BYTE,
                                         true, newStride, coverageOffset));
         }
     } else if (oldCoverageOffset > 0) {
@@ -925,10 +975,14 @@
 
     bool requiresAttributeColors = 
         !skipColor && SkToBool(desc.fVertexLayout & kColor_VertexLayoutBit);
-    // fColorInput records how colors are specified for the program. Strip
-    // the bit from the layout to avoid false negatives when searching for an
-    // existing program in the cache.
-    desc.fVertexLayout &= ~(kColor_VertexLayoutBit);
+    bool requiresAttributeCoverage = 
+        !skipCoverage && SkToBool(desc.fVertexLayout &
+                                  kCoverage_VertexLayoutBit);
+
+    // fColorInput/fCoverageInput record how colors are specified for the
+    // program. So we strip the bits from the layout to avoid false negatives
+    // when searching for an existing program in the cache.
+    desc.fVertexLayout &= ~(kColor_VertexLayoutBit | kCoverage_VertexLayoutBit);
 
     desc.fColorFilterXfermode = skipColor ?
                                 SkXfermode::kDst_Mode :
@@ -956,6 +1010,19 @@
     } else {
         desc.fColorInput = ProgramDesc::kAttribute_ColorInput;
     }
+    
+    bool covIsSolidWhite = !requiresAttributeCoverage &&
+                           0xffffffff == drawState.getCoverage();
+    
+    if (skipCoverage) {
+        desc.fCoverageInput = ProgramDesc::kTransBlack_ColorInput;
+    } else if (covIsSolidWhite) {
+        desc.fCoverageInput = ProgramDesc::kSolidWhite_ColorInput;
+    } else if (GR_GL_NO_CONSTANT_ATTRIBUTES && !requiresAttributeCoverage) {
+        desc.fCoverageInput = ProgramDesc::kUniform_ColorInput;
+    } else {
+        desc.fCoverageInput = ProgramDesc::kAttribute_ColorInput;
+    }
 
     desc.fEdgeAANumEdges = skipCoverage ? 0 : drawState.getNumAAEdges();
     desc.fEdgeAAConcave = desc.fEdgeAANumEdges > 0 &&
@@ -1031,6 +1098,12 @@
                 case GrSamplerState::kConvolution_Filter:
                     stage.fFetchMode = StageDesc::kConvolution_FetchMode;
                     break;
+                case GrSamplerState::kDilate_Filter:
+                    stage.fFetchMode = StageDesc::kDilate_FetchMode;
+                    break;
+                case GrSamplerState::kErode_Filter:
+                    stage.fFetchMode = StageDesc::kErode_FetchMode;
+                    break;
                 default:
                     GrCrash("Unexpected filter!");
                     break;
@@ -1045,7 +1118,7 @@
             }
 
             stage.fInConfigFlags = 0;
-            if (!this->glCaps().fTextureSwizzleSupport) {
+            if (!this->glCaps().textureSwizzleSupport()) {
                 if (GrPixelConfigIsAlphaOnly(texture->config())) {
                     // if we don't have texture swizzle support then
                     // the shader must do an alpha smear after reading
@@ -1056,10 +1129,22 @@
                 }
             }
             if (GrPixelConfigIsUnpremultiplied(texture->config())) {
-                stage.fInConfigFlags |= StageDesc::kMulRGBByAlpha_InConfigFlag;
+                // The shader generator assumes that color channels are bytes
+                // when rounding.
+                GrAssert(4 == GrBytesPerPixel(texture->config()));
+                if (kUpOnWrite_DownOnRead_UnpremulConversion ==
+                    fUnpremulConversion) {
+                    stage.fInConfigFlags |=
+                        StageDesc::kMulRGBByAlpha_RoundDown_InConfigFlag;
+                } else {
+                    stage.fInConfigFlags |=
+                        StageDesc::kMulRGBByAlpha_RoundUp_InConfigFlag;
+                }
             }
 
-            if (sampler.getFilter() == GrSamplerState::kConvolution_Filter) {
+            if (sampler.getFilter() == GrSamplerState::kConvolution_Filter ||
+                sampler.getFilter() == GrSamplerState::kDilate_Filter ||
+                sampler.getFilter() == GrSamplerState::kErode_Filter) {
                 stage.fKernelWidth = sampler.getKernelWidth();
             } else {
                 stage.fKernelWidth = 0;
@@ -1074,9 +1159,18 @@
     }
 
     if (GrPixelConfigIsUnpremultiplied(drawState.getRenderTarget()->config())) {
-        desc.fOutputPM = ProgramDesc::kNo_OutputPM;
+        // The shader generator assumes that color channels are bytes
+        // when rounding.
+        GrAssert(4 == GrBytesPerPixel(drawState.getRenderTarget()->config()));
+        if (kUpOnWrite_DownOnRead_UnpremulConversion == fUnpremulConversion) {
+            desc.fOutputConfig =
+                ProgramDesc::kUnpremultiplied_RoundUp_OutputConfig;
+        } else {
+            desc.fOutputConfig =
+                ProgramDesc::kUnpremultiplied_RoundDown_OutputConfig;
+        }
     } else {
-        desc.fOutputPM = ProgramDesc::kYes_OutputPM;
+        desc.fOutputConfig = ProgramDesc::kPremultiplied_OutputConfig;
     }
 
     desc.fDualSrcOutput = ProgramDesc::kNone_DualSrcOutput;
@@ -1104,7 +1198,7 @@
     if (!hasCoverage) {
         hasCoverage =
                desc.fEdgeAANumEdges ||
-               (desc.fVertexLayout & GrDrawTarget::kCoverage_VertexLayoutBit) ||
+               requiresAttributeCoverage ||
                (desc.fVertexLayout & GrDrawTarget::kEdge_VertexLayoutBit);
     }
 
diff --git a/src/gpu/GrGpuGLShaders.h b/src/gpu/gl/GrGpuGLShaders.h
similarity index 92%
rename from src/gpu/GrGpuGLShaders.h
rename to src/gpu/gl/GrGpuGLShaders.h
index 4b972b5..39bc974 100644
--- a/src/gpu/GrGpuGLShaders.h
+++ b/src/gpu/gl/GrGpuGLShaders.h
@@ -19,7 +19,7 @@
 // Programmable OpenGL or OpenGL ES 2.0
 class GrGpuGLShaders : public GrGpuGL {
 public:
-             GrGpuGLShaders(const GrGLInterface* glInterface);
+             GrGpuGLShaders(const GrGLContextInfo& ctxInfo);
     virtual ~GrGpuGLShaders();
 
     virtual void abandonResources();
@@ -57,9 +57,12 @@
     // sets the texture domain uniform for currently bound program
     void flushTextureDomain(int stage);
 
-    // sets the color specified by GrDrawTarget::setColor()
+    // sets the color specified by GrDrawState::setColor()
     void flushColor(GrColor color);
 
+    // sets the color specified by GrDrawState::setCoverage()
+    void flushCoverage(GrColor color);
+
     // sets the MVP matrix uniform for currently bound program
     void flushViewMatrix();
 
diff --git a/src/gpu/SkGLContext.cpp b/src/gpu/gl/SkGLContext.cpp
similarity index 60%
rename from src/gpu/SkGLContext.cpp
rename to src/gpu/gl/SkGLContext.cpp
index f6b7db8..525afe8 100644
--- a/src/gpu/SkGLContext.cpp
+++ b/src/gpu/gl/SkGLContext.cpp
@@ -5,7 +5,7 @@
  * Use of this source code is governed by a BSD-style license that can be
  * found in the LICENSE file.
  */
-#include "SkGLContext.h"
+#include "gl/SkGLContext.h"
 
 SkGLContext::SkGLContext()
     : fFBO(0)
@@ -16,6 +16,10 @@
     SkSafeUnref(fGL);
 }
 
+bool SkGLContext::hasExtension(const char* extensionName) const {
+    return GrGLHasExtensionFromString(extensionName, fExtensionString.c_str());
+}
+
 bool SkGLContext::init(int width, int height) {
     if (fGL) {
         fGL->unref();
@@ -24,18 +28,30 @@
 
     fGL = this->createGLContext();
     if (fGL) {
+        fExtensionString =
+            reinterpret_cast<const char*>(SK_GL(*this,
+                                                 GetString(GR_GL_EXTENSIONS)));
+        const char* versionStr =
+            reinterpret_cast<const char*>(SK_GL(*this,
+                                                GetString(GR_GL_VERSION)));
+        GrGLVersion version = GrGLGetVersionFromString(versionStr);
+
         // clear any existing GL erorrs
         GrGLenum error;
         do {
             error = SK_GL(*this, GetError());
         } while (GR_GL_NO_ERROR != error);
+
         GrGLuint cbID;
         GrGLuint dsID;
+
+        GrGLBinding bindingInUse = GrGLGetBindingInUse(this->gl());
+
         SK_GL(*this, GenFramebuffers(1, &fFBO));
         SK_GL(*this, BindFramebuffer(GR_GL_FRAMEBUFFER, fFBO));
         SK_GL(*this, GenRenderbuffers(1, &cbID));
         SK_GL(*this, BindRenderbuffer(GR_GL_RENDERBUFFER, cbID));
-        if (fGL->supportsES2()) {
+        if (kES2_GrGLBinding == bindingInUse) {
             SK_GL(*this, RenderbufferStorage(GR_GL_RENDERBUFFER,
                                              GR_GL_RGBA8,
                                              width, height));
@@ -50,18 +66,40 @@
                                              cbID));
         SK_GL(*this, GenRenderbuffers(1, &dsID));
         SK_GL(*this, BindRenderbuffer(GR_GL_RENDERBUFFER, dsID));
-        if (fGL->supportsES2()) {
-            SK_GL(*this, RenderbufferStorage(GR_GL_RENDERBUFFER,
-                                             GR_GL_STENCIL_INDEX8,
-                                             width, height));
+
+        // Some drivers that support packed depth stencil will only succeed
+        // in binding a packed format to an FBO. However, we can't rely on
+        // packed depth stencil being available.
+        bool supportsPackedDepthStencil;
+        if (kES2_GrGLBinding == bindingInUse) {
+            supportsPackedDepthStencil = 
+                    this->hasExtension("GL_OES_packed_depth_stencil");
         } else {
+            supportsPackedDepthStencil = version >= GR_GL_VER(3,0) ||
+                    this->hasExtension("GL_EXT_packed_depth_stencil") ||
+                    this->hasExtension("GL_ARB_framebuffer_object");
+        }
+
+        if (supportsPackedDepthStencil) {
+            // ES2 requires sized internal formats for RenderbufferStorage
+            // On Desktop we let the driver decide.
+            GrGLenum format = kES2_GrGLBinding == bindingInUse ? 
+                                    GR_GL_DEPTH24_STENCIL8 :
+                                    GR_GL_DEPTH_STENCIL;
             SK_GL(*this, RenderbufferStorage(GR_GL_RENDERBUFFER,
-                                             GR_GL_DEPTH_STENCIL,
+                                             format,
                                              width, height));
             SK_GL(*this, FramebufferRenderbuffer(GR_GL_FRAMEBUFFER,
                                                  GR_GL_DEPTH_ATTACHMENT,
                                                  GR_GL_RENDERBUFFER,
                                                  dsID));
+        } else {
+            GrGLenum format = kES2_GrGLBinding == bindingInUse ? 
+                                    GR_GL_STENCIL_INDEX8 :
+                                    GR_GL_STENCIL_INDEX;
+            SK_GL(*this, RenderbufferStorage(GR_GL_RENDERBUFFER,
+                                             format,
+                                             width, height));
         }
         SK_GL(*this, FramebufferRenderbuffer(GR_GL_FRAMEBUFFER,
                                              GR_GL_STENCIL_ATTACHMENT,
diff --git a/src/gpu/SkNullGLContext.cpp b/src/gpu/gl/SkNullGLContext.cpp
similarity index 88%
rename from src/gpu/SkNullGLContext.cpp
rename to src/gpu/gl/SkNullGLContext.cpp
index 04e63d8..86c09b2 100644
--- a/src/gpu/SkNullGLContext.cpp
+++ b/src/gpu/gl/SkNullGLContext.cpp
@@ -6,7 +6,7 @@
  * found in the LICENSE file.
  */
 
-#include "SkNullGLContext.h"
+#include "gl/SkNullGLContext.h"
 
 const GrGLInterface* SkNullGLContext::createGLContext() {
     return GrGLCreateNullInterface();
diff --git a/src/gpu/gr_hello_world.cpp b/src/gpu/gr_hello_world.cpp
index b475fb8..b19f9b4 100644
--- a/src/gpu/gr_hello_world.cpp
+++ b/src/gpu/gr_hello_world.cpp
@@ -5,10 +5,10 @@
  * Use of this source code is governed by a BSD-style license that can be
  * found in the LICENSE file.
  */
-#include "SkGLCanvas.h"
+#include "gl/SkGLCanvas.h"
 #include "SkBitmap.h"
 #include "SkPaint.h"
-#include "SkGpuGLShaders.h"
+#include "gl/SkGpuGLShaders.h"
 
 extern "C" {
     void gr_hello_world();
diff --git a/src/gpu/ios/GrGLDefaultInterface_iOS.cpp b/src/gpu/ios/GrGLDefaultInterface_iOS.cpp
index 189fb25..9fc953f 100644
--- a/src/gpu/ios/GrGLDefaultInterface_iOS.cpp
+++ b/src/gpu/ios/GrGLDefaultInterface_iOS.cpp
@@ -7,7 +7,7 @@
  */
 
 
-#include "GrGLInterface.h"
+#include "gl/GrGLInterface.h"
 
 #import <OpenGLES/ES2/gl.h>
 #import <OpenGLES/ES2/glext.h>
diff --git a/src/gpu/mac/GrGLCreateNativeInterface_mac.cpp b/src/gpu/mac/GrGLCreateNativeInterface_mac.cpp
index 51a4111..5ae214a 100644
--- a/src/gpu/mac/GrGLCreateNativeInterface_mac.cpp
+++ b/src/gpu/mac/GrGLCreateNativeInterface_mac.cpp
@@ -7,7 +7,7 @@
  */
 
 
-#include "GrGLInterface.h"
+#include "gl/GrGLInterface.h"
 
 #include <OpenGL/gl.h>
 #include <OpenGL/glext.h>
diff --git a/src/gpu/mac/SkNativeGLContext_mac.cpp b/src/gpu/mac/SkNativeGLContext_mac.cpp
index ad68c40..18b36a5 100644
--- a/src/gpu/mac/SkNativeGLContext_mac.cpp
+++ b/src/gpu/mac/SkNativeGLContext_mac.cpp
@@ -5,7 +5,7 @@
  * Use of this source code is governed by a BSD-style license that can be
  * found in the LICENSE file.
  */
-#include "SkNativeGLContext.h"
+#include "gl/SkNativeGLContext.h"
 
 SkNativeGLContext::AutoContextRestore::AutoContextRestore() {
     fOldAGLContext = aglGetCurrentContext();
diff --git a/src/gpu/mesa/GrGLCreateMesaInterface.cpp b/src/gpu/mesa/GrGLCreateMesaInterface.cpp
index 7303d1b..4686438 100644
--- a/src/gpu/mesa/GrGLCreateMesaInterface.cpp
+++ b/src/gpu/mesa/GrGLCreateMesaInterface.cpp
@@ -7,7 +7,7 @@
  */
 
 
-#include "GrGLInterface.h"
+#include "gl/GrGLInterface.h"
 
 #define GL_GLEXT_PROTOTYPES
 #include <GL/osmesa.h>
diff --git a/src/gpu/mesa/SkMesaGLContext.cpp b/src/gpu/mesa/SkMesaGLContext.cpp
index b817fa7..c4f84cf 100644
--- a/src/gpu/mesa/SkMesaGLContext.cpp
+++ b/src/gpu/mesa/SkMesaGLContext.cpp
@@ -8,7 +8,7 @@
 
 #include <GL/osmesa.h>
 
-#include "SkMesaGLContext.h"
+#include "gl/SkMesaGLContext.h"
 
 SkMesaGLContext::AutoContextRestore::AutoContextRestore() {
     fOldContext = (Context)OSMesaGetCurrentContext();
diff --git a/src/gpu/unix/GrGLCreateNativeInterface_unix.cpp b/src/gpu/unix/GrGLCreateNativeInterface_unix.cpp
index ab0d351..1e9f2e0 100644
--- a/src/gpu/unix/GrGLCreateNativeInterface_unix.cpp
+++ b/src/gpu/unix/GrGLCreateNativeInterface_unix.cpp
@@ -7,7 +7,7 @@
  */
 
 
-#include "GrGLInterface.h"
+#include "gl/GrGLInterface.h"
 
 #include <GL/glx.h>
 #include <GL/gl.h>
diff --git a/src/gpu/unix/SkNativeGLContext_unix.cpp b/src/gpu/unix/SkNativeGLContext_unix.cpp
index 907e2b8..f4199eb 100644
--- a/src/gpu/unix/SkNativeGLContext_unix.cpp
+++ b/src/gpu/unix/SkNativeGLContext_unix.cpp
@@ -5,7 +5,7 @@
  * Use of this source code is governed by a BSD-style license that can be
  * found in the LICENSE file.
  */
-#include "SkNativeGLContext.h"
+#include "gl/SkNativeGLContext.h"
 
 #include <GL/glu.h>
 
diff --git a/src/gpu/win/GrGLCreateNativeInterface_win.cpp b/src/gpu/win/GrGLCreateNativeInterface_win.cpp
index b59f930..f050a6f 100644
--- a/src/gpu/win/GrGLCreateNativeInterface_win.cpp
+++ b/src/gpu/win/GrGLCreateNativeInterface_win.cpp
@@ -7,7 +7,7 @@
  */
 
 
-#include "GrGLInterface.h"
+#include "gl/GrGLInterface.h"
 #define WIN32_LEAN_AND_MEAN
 #include <Windows.h>
 #include <GL/GL.h>
diff --git a/src/gpu/win/SkNativeGLContext_win.cpp b/src/gpu/win/SkNativeGLContext_win.cpp
index 5d518dd..9650bc1 100644
--- a/src/gpu/win/SkNativeGLContext_win.cpp
+++ b/src/gpu/win/SkNativeGLContext_win.cpp
@@ -6,7 +6,7 @@
  * found in the LICENSE file.
  */
 
-#include "SkNativeGLContext.h"
+#include "gl/SkNativeGLContext.h"
 
 #define WIN32_LEAN_AND_MEAN
 #include <Windows.h>
diff --git a/src/images/SkImageDecoder.cpp b/src/images/SkImageDecoder.cpp
index ee0d6cc..de0afbb 100644
--- a/src/images/SkImageDecoder.cpp
+++ b/src/images/SkImageDecoder.cpp
@@ -45,7 +45,7 @@
 SkImageDecoder::SkImageDecoder()
     : fReporter(NULL), fPeeker(NULL), fChooser(NULL), fAllocator(NULL),
       fSampleSize(1), fDefaultPref(SkBitmap::kNo_Config), fDitherImage(true),
-      fUsePrefTable(false) {
+      fUsePrefTable(false),fPreferQualityOverSpeed(false) {
 }
 
 SkImageDecoder::~SkImageDecoder() {
@@ -208,7 +208,13 @@
     }
     dest->setConfig(src->getConfig(), w, h);
     dest->setIsOpaque(src->isOpaque());
-    this->allocPixelRef(dest, NULL);
+
+    if (!this->allocPixelRef(dest, NULL)) {
+#ifdef SK_DEBUG
+        SkDebugf("failed to allocate pixels needed to crop the bitmap");
+#endif
+        return;
+    }
 
     SkCanvas canvas(*dest);
     canvas.drawBitmap(*src, (srcX - destX) / sampleSize,
diff --git a/src/images/SkImageRef.cpp b/src/images/SkImageRef.cpp
index 2d53f7e..1d6b270 100644
--- a/src/images/SkImageRef.cpp
+++ b/src/images/SkImageRef.cpp
@@ -16,7 +16,7 @@
 //#define DUMP_IMAGEREF_LIFECYCLE
 
 // can't be static, as SkImageRef_Pool needs to see it
-SkMutex gImageRefMutex;
+SK_DECLARE_GLOBAL_MUTEX(gImageRefMutex);
 
 ///////////////////////////////////////////////////////////////////////////////
 
diff --git a/src/images/SkImageRef_GlobalPool.cpp b/src/images/SkImageRef_GlobalPool.cpp
index 6ea42c1..b774023 100644
--- a/src/images/SkImageRef_GlobalPool.cpp
+++ b/src/images/SkImageRef_GlobalPool.cpp
@@ -9,25 +9,41 @@
 #include "SkImageRefPool.h"
 #include "SkThread.h"
 
-extern SkMutex gImageRefMutex;
+extern SkBaseMutex gImageRefMutex;
 
-static SkImageRefPool gGlobalImageRefPool;
+/*
+ *  This returns the lazily-allocated global pool. It must be called
+ *  from inside the guard mutex, so we safely only ever allocate 1.
+ */
+static SkImageRefPool* GetGlobalPool() {
+    static SkImageRefPool* gPool;
+    if (NULL == gPool) {
+        gPool = SkNEW(SkImageRefPool);
+        // call sk_atexit(...) when we have that, to free the global pool
+    }
+    return gPool;
+}
 
 SkImageRef_GlobalPool::SkImageRef_GlobalPool(SkStream* stream,
                                              SkBitmap::Config config,
                                              int sampleSize)
         : SkImageRef(stream, config, sampleSize) {
     this->mutex()->acquire();
-    gGlobalImageRefPool.addToHead(this);
+    GetGlobalPool()->addToHead(this);
     this->mutex()->release();
 }
 
 SkImageRef_GlobalPool::~SkImageRef_GlobalPool() {
     this->mutex()->acquire();
-    gGlobalImageRefPool.detach(this);
+    GetGlobalPool()->detach(this);
     this->mutex()->release();
 }
-    
+
+/*  By design, onUnlockPixels() already is inside the mutex-lock,
+ *  and it is the (indirect) caller of onDecode(), therefore we can assume
+ *  that we also are already inside the mutex. Hence, we can reference
+ *  the global-pool directly.
+ */
 bool SkImageRef_GlobalPool::onDecode(SkImageDecoder* codec, SkStream* stream,
                                      SkBitmap* bitmap, SkBitmap::Config config,
                                      SkImageDecoder::Mode mode) {
@@ -35,7 +51,8 @@
         return false;
     }
     if (mode == SkImageDecoder::kDecodePixels_Mode) {
-        gGlobalImageRefPool.justAddedPixels(this);
+        // no need to grab the mutex here, it has already been acquired.
+        GetGlobalPool()->justAddedPixels(this);
     }
     return true;
 }
@@ -43,13 +60,14 @@
 void SkImageRef_GlobalPool::onUnlockPixels() {
     this->INHERITED::onUnlockPixels();
     
-    gGlobalImageRefPool.canLosePixels(this);
+    // by design, onUnlockPixels() already is inside the mutex-lock
+    GetGlobalPool()->canLosePixels(this);
 }
 
 SkImageRef_GlobalPool::SkImageRef_GlobalPool(SkFlattenableReadBuffer& buffer)
         : INHERITED(buffer) {
     this->mutex()->acquire();
-    gGlobalImageRefPool.addToHead(this);
+    GetGlobalPool()->addToHead(this);
     this->mutex()->release();
 }
 
@@ -64,25 +82,25 @@
 
 size_t SkImageRef_GlobalPool::GetRAMBudget() {
     SkAutoMutexAcquire ac(gImageRefMutex);
-    return gGlobalImageRefPool.getRAMBudget();
+    return GetGlobalPool()->getRAMBudget();
 }
 
 void SkImageRef_GlobalPool::SetRAMBudget(size_t size) {
     SkAutoMutexAcquire ac(gImageRefMutex);
-    gGlobalImageRefPool.setRAMBudget(size);
+    GetGlobalPool()->setRAMBudget(size);
 }
 
 size_t SkImageRef_GlobalPool::GetRAMUsed() {
     SkAutoMutexAcquire ac(gImageRefMutex);    
-    return gGlobalImageRefPool.getRAMUsed();
+    return GetGlobalPool()->getRAMUsed();
 }
 
 void SkImageRef_GlobalPool::SetRAMUsed(size_t usage) {
     SkAutoMutexAcquire ac(gImageRefMutex);
-    gGlobalImageRefPool.setRAMUsed(usage);
+    GetGlobalPool()->setRAMUsed(usage);
 }
 
 void SkImageRef_GlobalPool::DumpPool() {
     SkAutoMutexAcquire ac(gImageRefMutex);
-    gGlobalImageRefPool.dump();
+    GetGlobalPool()->dump();
 }
diff --git a/src/opts/SkBitmapProcState_opts_SSE2.cpp b/src/opts/SkBitmapProcState_opts_SSE2.cpp
index 9a0a013..1852c66 100644
--- a/src/opts/SkBitmapProcState_opts_SSE2.cpp
+++ b/src/opts/SkBitmapProcState_opts_SSE2.cpp
@@ -232,3 +232,403 @@
         *colors++ = _mm_cvtsi128_si32(sum);
     } while (--count > 0);
 }
+
+static inline uint32_t ClampX_ClampY_pack_filter(SkFixed f, unsigned max,
+                                                 SkFixed one) {
+    unsigned i = SkClampMax(f >> 16, max);
+    i = (i << 4) | ((f >> 12) & 0xF);
+    return (i << 14) | SkClampMax((f + one) >> 16, max);
+}
+
+/*  SSE version of ClampX_ClampY_filter_scale()
+ *  portable version is in core/SkBitmapProcState_matrix.h
+ */
+void ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s, uint32_t xy[],
+                                     int count, int x, int y) {
+    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
+                             SkMatrix::kScale_Mask)) == 0);
+    SkASSERT(s.fInvKy == 0);
+    
+    const unsigned maxX = s.fBitmap->width() - 1;
+    const SkFixed one = s.fFilterOneX;
+    const SkFixed dx = s.fInvSx;
+    SkFixed fx;
+
+    SkPoint pt;
+    s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
+                                SkIntToScalar(y) + SK_ScalarHalf, &pt);
+    const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1);
+    const unsigned maxY = s.fBitmap->height() - 1;
+    // compute our two Y values up front
+    *xy++ = ClampX_ClampY_pack_filter(fy, maxY, s.fFilterOneY);
+    // now initialize fx
+    fx = SkScalarToFixed(pt.fX) - (one >> 1);
+
+    // test if we don't need to apply the tile proc
+    if (dx > 0 && (unsigned)(fx >> 16) <= maxX &&
+        (unsigned)((fx + dx * (count - 1)) >> 16) < maxX) {
+        if (count >= 4) {
+            // SSE version of decal_filter_scale
+            while ((size_t(xy) & 0x0F) != 0) {
+                SkASSERT((fx >> (16 + 14)) == 0);
+                *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
+                fx += dx;
+                count--;
+            }
+
+            __m128i wide_1    = _mm_set1_epi32(1);
+            __m128i wide_dx4  = _mm_set1_epi32(dx * 4);
+            __m128i wide_fx   = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
+                                              fx + dx, fx);
+
+            while (count >= 4) {
+                __m128i wide_out; 
+    
+                wide_out = _mm_slli_epi32(_mm_srai_epi32(wide_fx, 12), 14);
+                wide_out = _mm_or_si128(wide_out, _mm_add_epi32(
+                                        _mm_srai_epi32(wide_fx, 16), wide_1)); 
+                
+                _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_out);
+      
+                xy += 4;
+                fx += dx * 4;
+                wide_fx  = _mm_add_epi32(wide_fx, wide_dx4);
+                count -= 4;
+            } // while count >= 4
+        } // if count >= 4
+
+        while (count-- > 0) {
+            SkASSERT((fx >> (16 + 14)) == 0);
+            *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
+            fx += dx;
+        }
+    } else {
+        // SSE2 only supports 16-bit integer max & min, so only take the
+        // vector path when maxX (the bitmap's width - 1) fits in 16 bits.
+        // Bitmaps wider than the 16-bit integer range should be rare in
+        // the real world.
+        if ((count >= 4) && (maxX <= 0xFFFF)) {
+            while (((size_t)xy & 0x0F) != 0) {
+                *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
+                fx += dx;
+                count--;
+            }
+    
+            __m128i wide_fx   = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
+                                              fx + dx, fx);
+            __m128i wide_dx4  = _mm_set1_epi32(dx * 4);
+            __m128i wide_one  = _mm_set1_epi32(one);
+            __m128i wide_maxX = _mm_set1_epi32(maxX); 
+            __m128i wide_mask = _mm_set1_epi32(0xF);
+
+             while (count >= 4) {
+                __m128i wide_i;
+                __m128i wide_lo;
+                __m128i wide_fx1;
+
+                // i = SkClampMax(f>>16,maxX)
+                wide_i = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16), 
+                                       _mm_setzero_si128());
+                wide_i = _mm_min_epi16(wide_i, wide_maxX);
+    
+                // i<<4 | TILEX_LOW_BITS(fx)
+                wide_lo = _mm_srli_epi32(wide_fx, 12);
+                wide_lo = _mm_and_si128(wide_lo, wide_mask);
+                wide_i  = _mm_slli_epi32(wide_i, 4);         
+                wide_i  = _mm_or_si128(wide_i, wide_lo);     
+    
+                // i<<14
+                wide_i = _mm_slli_epi32(wide_i, 14);
+    
+                // SkClampMax(((f+one))>>16,max)
+                wide_fx1 = _mm_add_epi32(wide_fx, wide_one);
+                wide_fx1 = _mm_max_epi16(_mm_srli_epi32(wide_fx1, 16), 
+                                                        _mm_setzero_si128());
+                wide_fx1 = _mm_min_epi16(wide_fx1, wide_maxX);
+                    
+                // final combination
+                wide_i = _mm_or_si128(wide_i, wide_fx1);
+                _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i); 
+    
+                wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
+                fx += dx * 4;   
+                xy += 4;
+                count -= 4;
+            } // while count >= 4
+        } // if count >= 4
+
+        while (count-- > 0) {
+            *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
+            fx += dx;
+        }
+    }
+}
+
+/*  SSE version of ClampX_ClampY_nofilter_scale()
+ *  portable version is in core/SkBitmapProcState_matrix.h
+ */
+void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s,
+                                    uint32_t xy[], int count, int x, int y) {
+    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
+                             SkMatrix::kScale_Mask)) == 0);
+
+    // output layout: one 32-bit y index, then count 16-bit x indices
+    const unsigned maxX = s.fBitmap->width() - 1;
+    SkFixed fx;
+    SkPoint pt;
+    s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
+                                SkIntToScalar(y) + SK_ScalarHalf, &pt);
+    fx = SkScalarToFixed(pt.fY);
+    const unsigned maxY = s.fBitmap->height() - 1;
+    *xy++ = SkClampMax(fx >> 16, maxY);
+    fx = SkScalarToFixed(pt.fX);
+
+    if (0 == maxX) {
+        // all of the following X values must be 0
+        memset(xy, 0, count * sizeof(uint16_t));
+        return;
+    }
+
+    const SkFixed dx = s.fInvSx;
+
+    // test if we don't need to apply the tile proc
+    if ((unsigned)(fx >> 16) <= maxX &&
+        (unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) {
+        // SSE version of decal_nofilter_scale
+        if (count >= 8) {
+            // scalar steps (two x values each) until xy is 16-byte aligned
+            while (((size_t)xy & 0x0F) != 0) {
+                *xy++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
+                fx += 2 * dx;
+                count -= 2;
+            }
+
+            __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
+            __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
+
+            __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
+                                             fx + dx, fx);
+            __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
+
+            while (count >= 8) {
+                __m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
+                __m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
+
+                __m128i wide_result = _mm_packs_epi32(wide_out_low,
+                                                      wide_out_high);
+                _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
+
+                wide_low = _mm_add_epi32(wide_low, wide_dx8);
+                wide_high = _mm_add_epi32(wide_high, wide_dx8);
+
+                xy += 4;
+                fx += dx * 8;
+                count -= 8;
+            }
+        } // if count >= 8
+
+        uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
+        while (count-- > 0) {
+            *xx++ = SkToU16(fx >> 16);
+            fx += dx;
+        }
+    } else {
+        // SSE2 only supports 16-bit integer max & min, so only take the
+        // vector path when maxX (the bitmap's width - 1) fits in 16 bits;
+        // wider bitmaps should be rare in the real world. Each 32-bit store
+        // packs two consecutive clamped X values, as in the decal path above.
+        if ((count >= 8) && (maxX <= 0xFFFF)) {
+            while (((size_t)xy & 0x0F) != 0) {
+                *xy++ = pack_two_shorts(SkClampMax(fx >> 16, maxX),
+                                        SkClampMax((fx + dx) >> 16, maxX));
+                fx += 2 * dx;
+                count -= 2;
+            }
+
+            __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
+            __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
+
+            __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
+                                             fx + dx, fx);
+            __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
+            __m128i wide_maxX = _mm_set1_epi32(maxX);
+
+            while (count >= 8) {
+                __m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
+                __m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
+
+                wide_out_low  = _mm_max_epi16(wide_out_low,
+                                              _mm_setzero_si128());
+                wide_out_low  = _mm_min_epi16(wide_out_low, wide_maxX);
+                wide_out_high = _mm_max_epi16(wide_out_high,
+                                              _mm_setzero_si128());
+                wide_out_high = _mm_min_epi16(wide_out_high, wide_maxX);
+
+                __m128i wide_result = _mm_packs_epi32(wide_out_low,
+                                                      wide_out_high);
+                _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
+
+                wide_low  = _mm_add_epi32(wide_low, wide_dx8);
+                wide_high = _mm_add_epi32(wide_high, wide_dx8);
+
+                xy += 4;
+                fx += dx * 8;
+                count -= 8;
+            }
+        } // if count >= 8
+
+        uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
+        while (count-- > 0) {
+            *xx++ = SkClampMax(fx >> 16, maxX);
+            fx += dx;
+        }
+    }
+}
+
+/*  SSE version of ClampX_ClampY_filter_affine()
+ *  portable version is in core/SkBitmapProcState_matrix.h
+ */
+void ClampX_ClampY_filter_affine_SSE2(const SkBitmapProcState& s,
+                                      uint32_t xy[], int count, int x, int y) {
+    SkPoint srcPt;
+    s.fInvProc(*s.fInvMatrix,
+               SkIntToScalar(x) + SK_ScalarHalf,
+               SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
+    
+    SkFixed oneX = s.fFilterOneX;
+    SkFixed oneY = s.fFilterOneY;
+    SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1);
+    SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1);
+    SkFixed dx = s.fInvSx;
+    SkFixed dy = s.fInvKy;
+    unsigned maxX = s.fBitmap->width() - 1;
+    unsigned maxY = s.fBitmap->height() - 1;
+
+    if (count >= 2 && (maxX <= 0xFFFF)) {
+        SkFixed dx2 = dx + dx;
+        SkFixed dy2 = dy + dy;
+
+        __m128i wide_f = _mm_set_epi32(fx + dx, fy + dy, fx, fy);
+        __m128i wide_d2  = _mm_set_epi32(dx2, dy2, dx2, dy2);
+        __m128i wide_one  = _mm_set_epi32(oneX, oneY, oneX, oneY);
+        __m128i wide_max = _mm_set_epi32(maxX, maxY, maxX, maxY); 
+        __m128i wide_mask = _mm_set1_epi32(0xF);
+
+        while (count >= 2) {
+            // i = SkClampMax(f>>16,maxX)
+            __m128i wide_i = _mm_max_epi16(_mm_srli_epi32(wide_f, 16), 
+                                           _mm_setzero_si128());
+            wide_i = _mm_min_epi16(wide_i, wide_max);
+    
+            // i<<4 | TILEX_LOW_BITS(f)
+            __m128i wide_lo = _mm_srli_epi32(wide_f, 12);
+            wide_lo = _mm_and_si128(wide_lo, wide_mask);
+            wide_i  = _mm_slli_epi32(wide_i, 4);         
+            wide_i  = _mm_or_si128(wide_i, wide_lo);     
+    
+            // i<<14
+            wide_i = _mm_slli_epi32(wide_i, 14);
+    
+            // SkClampMax(((f+one))>>16,max)
+            __m128i wide_f1 = _mm_add_epi32(wide_f, wide_one);
+            wide_f1 = _mm_max_epi16(_mm_srli_epi32(wide_f1, 16), 
+                                                   _mm_setzero_si128());
+            wide_f1 = _mm_min_epi16(wide_f1, wide_max);
+                    
+            // final combination
+            wide_i = _mm_or_si128(wide_i, wide_f1);
+            _mm_storeu_si128(reinterpret_cast<__m128i*>(xy), wide_i); 
+    
+            wide_f = _mm_add_epi32(wide_f, wide_d2);
+
+            fx += dx2; 
+            fy += dy2;
+            xy += 4;
+            count -= 2;
+        } // while count >= 2
+    } // if count >= 2
+
+    while (count-- > 0) {
+        *xy++ = ClampX_ClampY_pack_filter(fy, maxY, oneY);
+        fy += dy;
+        *xy++ = ClampX_ClampY_pack_filter(fx, maxX, oneX);
+        fx += dx;          
+    }
+}
+
+/*  SSE version of ClampX_ClampY_nofilter_affine()
+ *  portable version is in core/SkBitmapProcState_matrix.h
+ */
+void ClampX_ClampY_nofilter_affine_SSE2(const SkBitmapProcState& s,
+                                      uint32_t xy[], int count, int x, int y) {
+    SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
+    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
+                             SkMatrix::kScale_Mask |
+                             SkMatrix::kAffine_Mask)) == 0);
+
+    SkPoint srcPt;
+    s.fInvProc(*s.fInvMatrix,
+               SkIntToScalar(x) + SK_ScalarHalf,
+               SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
+    
+    SkFixed fx = SkScalarToFixed(srcPt.fX);
+    SkFixed fy = SkScalarToFixed(srcPt.fY);
+    SkFixed dx = s.fInvSx;
+    SkFixed dy = s.fInvKy;
+    int maxX = s.fBitmap->width() - 1;
+    int maxY = s.fBitmap->height() - 1;
+
+    if (count >= 4 && (maxX <= 0xFFFF)) {
+        while (((size_t)xy & 0x0F) != 0) {
+            *xy++ = (SkClampMax(fy >> 16, maxY) << 16) | 
+                                  SkClampMax(fx >> 16, maxX);
+            fx += dx;
+            fy += dy;
+            count--;
+        }
+
+        SkFixed dx4 = dx * 4;
+        SkFixed dy4 = dy * 4;
+
+        __m128i wide_fx   = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
+                                          fx + dx, fx);
+        __m128i wide_fy   = _mm_set_epi32(fy + dy * 3, fy + dy * 2,
+                                          fy + dy, fy);
+        __m128i wide_dx4  = _mm_set1_epi32(dx4);
+        __m128i wide_dy4  = _mm_set1_epi32(dy4);
+
+        __m128i wide_maxX = _mm_set1_epi32(maxX); 
+        __m128i wide_maxY = _mm_set1_epi32(maxY); 
+
+        while (count >= 4) {
+            // SkClampMax(fx>>16,maxX)
+            __m128i wide_lo = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16), 
+                                            _mm_setzero_si128());
+            wide_lo = _mm_min_epi16(wide_lo, wide_maxX);
+    
+            // SkClampMax(fy>>16,maxY)
+            __m128i wide_hi = _mm_max_epi16(_mm_srli_epi32(wide_fy, 16), 
+                                            _mm_setzero_si128());
+            wide_hi = _mm_min_epi16(wide_hi, wide_maxY);
+                    
+            // final combination
+            __m128i wide_i = _mm_or_si128(_mm_slli_epi32(wide_hi, 16),
+                                          wide_lo);
+            _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i); 
+ 
+            wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
+            wide_fy = _mm_add_epi32(wide_fy, wide_dy4);
+
+            fx += dx4; 
+            fy += dy4;
+            xy += 4;
+            count -= 4;
+        } // while count >= 4
+    } // if count >= 4
+
+    while (count-- > 0) {
+        *xy++ = (SkClampMax(fy >> 16, maxY) << 16) |
+                              SkClampMax(fx >> 16, maxX);
+        fx += dx;
+        fy += dy;           
+    }
+}
diff --git a/src/opts/SkBitmapProcState_opts_SSE2.h b/src/opts/SkBitmapProcState_opts_SSE2.h
index 9e56642..3fdf696 100644
--- a/src/opts/SkBitmapProcState_opts_SSE2.h
+++ b/src/opts/SkBitmapProcState_opts_SSE2.h
@@ -17,3 +17,11 @@
                                   int count, uint32_t* colors);
 void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
                   SkPMColor color);
+void ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s, uint32_t xy[],
+                                     int count, int x, int y);
+void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s,
+                                       uint32_t xy[], int count, int x, int y);
+void ClampX_ClampY_filter_affine_SSE2(const SkBitmapProcState& s,
+                                      uint32_t xy[], int count, int x, int y);
+void ClampX_ClampY_nofilter_affine_SSE2(const SkBitmapProcState& s,
+                                       uint32_t xy[], int count, int x, int y);
diff --git a/src/opts/SkBitmapProcState_opts_SSSE3.cpp b/src/opts/SkBitmapProcState_opts_SSSE3.cpp
new file mode 100644
index 0000000..98b3445
--- /dev/null
+++ b/src/opts/SkBitmapProcState_opts_SSSE3.cpp
@@ -0,0 +1,497 @@
+/*
+ * Copyright 2012 The Android Open Source Project
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include <tmmintrin.h>  // SSSE3
+#include "SkBitmapProcState_opts_SSSE3.h"
+#include "SkUtils.h"
+
+// adding anonymous namespace seemed to force gcc to inline directly the
+// instantiation, instead of creating the functions
+// S32_generic_D32_filter_DX_SSSE3<true> and
+// S32_generic_D32_filter_DX_SSSE3<false> which were then called by the
+// external functions.
+namespace {
+// In this file, variations for alpha and non alpha versions are implemented
+// with a template, as it makes the code more compact and a bit easier to
+// maintain, while making the compiler generate the same exact code as with
+// two functions that only differ by a few lines.
+
+
+// Prepare all necessary constants for a round of processing for two pixel
+// pairs.
+// @param xy is the location where the xy parameters for four pixels should be
+//           read from. It is identical in concept with argument two of
+//           S32_{opaque}_D32_filter_DX methods.
+// @param mask_3FFF vector of 32 bit constants containing 3FFF,
+//                  suitable to mask the bottom 14 bits of a XY value.
+// @param mask_000F vector of 32 bit constants containing 000F,
+//                  suitable to mask the bottom 4 bits of a XY value.
+// @param sixteen_8bit vector of 8 bit components containing the value 16.
+// @param mask_dist_select vector of 8 bit components containing the shuffling
+//                         parameters to reorder x[0-3] parameters.
+// @param all_x_result vector of 8 bit components that will contain the
+//              (4x(x3), 4x(x2), 4x(x1), 4x(x0)) upon return.
+// @param sixteen_minus_x vector of 8 bit components, containing
+//              (4x(16 - x3), 4x(16 - x2), 4x(16 - x1), 4x(16 - x0))
+inline void PrepareConstantsTwoPixelPairs(const uint32_t* xy,
+                                          const __m128i& mask_3FFF,
+                                          const __m128i& mask_000F,
+                                          const __m128i& sixteen_8bit,
+                                          const __m128i& mask_dist_select,
+                                          __m128i* all_x_result,
+                                          __m128i* sixteen_minus_x,
+                                          int* x0,
+                                          int* x1) {
+    const __m128i xx = _mm_loadu_si128(reinterpret_cast<const __m128i *>(xy));
+
+    // 4 delta X
+    // (x03, x02, x01, x00)
+    const __m128i x0_wide = _mm_srli_epi32(xx, 18);
+    // (x13, x12, x11, x10)
+    const __m128i x1_wide = _mm_and_si128(xx, mask_3FFF);
+
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(x0), x0_wide);
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(x1), x1_wide);
+
+    __m128i all_x = _mm_and_si128(_mm_srli_epi32(xx, 14), mask_000F);
+
+    // (4x(x3), 4x(x2), 4x(x1), 4x(x0))
+    all_x = _mm_shuffle_epi8(all_x, mask_dist_select);
+
+    *all_x_result = all_x;
+    // (4x(16-x3), 4x(16-x2), 4x(16-x1), 4x(16-x0))
+    *sixteen_minus_x = _mm_sub_epi8(sixteen_8bit, all_x);
+}
+
+// Helper function used when processing one pixel pair.
+// @param pixel0..3 are the four input pixels
+// @param scale_x vector of 8 bit components to multiply the pixel[0:3]. This
+//                will contain (4x(x1, 16-x1), 4x(x0, 16-x0))
+//                or (4x(x3, 16-x3), 4x(x2, 16-x2))
+// @return a vector of 16 bit components containing:
+// (Aa2 * (16 - x1) + Aa3 * x1, ... , Ra0 * (16 - x0) + Ra1 * x0)
+inline __m128i ProcessPixelPairHelper(uint32_t pixel0,
+                                      uint32_t pixel1,
+                                      uint32_t pixel2,
+                                      uint32_t pixel3,
+                                      const __m128i& scale_x) {
+    __m128i a0, a1, a2, a3;
+    // Load 2 pairs of pixels
+    a0 = _mm_cvtsi32_si128(pixel0);
+    a1 = _mm_cvtsi32_si128(pixel1);
+
+    // Interleave pixels.
+    // (0, 0, 0, 0, 0, 0, 0, 0, Aa1, Aa0, Ba1, Ba0, Ga1, Ga0, Ra1, Ra0)
+    a0 = _mm_unpacklo_epi8(a0, a1);
+
+    a2 = _mm_cvtsi32_si128(pixel2);
+    a3 = _mm_cvtsi32_si128(pixel3);
+    // (0, 0, 0, 0, 0, 0, 0, 0, Aa3, Aa2, Ba3, Ba2, Ga3, Ga2, Ra3, Ra2)
+    a2 = _mm_unpacklo_epi8(a2, a3);
+
+    // two pairs of pixel pairs, interleaved.
+    // (Aa3, Aa2, Ba3, Ba2, Ga3, Ga2, Ra3, Ra2,
+    //  Aa1, Aa0, Ba1, Ba0, Ga1, Ga0, Ra1, Ra0)
+    a0 = _mm_unpacklo_epi64(a0, a2);
+
+    // multiply and sum to 16 bit components.
+    // (Aa2 * (16 - x1) + Aa3 * x1, ... , Ra0 * (16 - x0) + Ra1 * x0)
+    // At that point, we use up a bit less than 12 bits for each 16 bit
+    // component:
+    // All components are at most 255. So,
+    // C0 * (16 - x) + C1 * x <= 255 * (16 - x) + 255 * x = 255 * 16.
+    return _mm_maddubs_epi16(a0, scale_x);
+}
+
+// Scale back the results after multiplications to the [0:255] range, and scale
+// by alpha when has_alpha is true.
+// Depending on whether one set or two sets of multiplications had been applied,
+// the results have to be shifted by four places (dividing by 16), or shifted
+// by eight places (dividing by 256), since each multiplication is by a quantity
+// in the range [0:16].
+template<bool has_alpha, int scale>
+inline __m128i ScaleFourPixels(__m128i* pixels,
+                               const __m128i& alpha) {
+    // Divide each 16 bit component by 16 (or 256 depending on scale).
+    *pixels = _mm_srli_epi16(*pixels, scale);
+
+    if (has_alpha) {
+        // Multiply by alpha.
+        *pixels = _mm_mullo_epi16(*pixels, alpha);
+
+        // Divide each 16 bit component by 256.
+        *pixels = _mm_srli_epi16(*pixels, 8);
+    }
+    return *pixels;
+}
+
+// Wrapper to calculate two output pixels from four input pixels. The
+// arguments are the same as ProcessPixelPairHelper. Technically, there are
+// eight input pixels, but since sub_y == 0, the factor applied to half of the
+// pixels (sub_y) is zero, so they are omitted here to save on some
+// processing.
+// @param alpha when has_alpha is true, scale all resulting components by this
+//              value.
+// @return a vector of 16 bit components containing:
+// ((Aa2 * (16 - x1) + Aa3 * x1) * alpha, ...,
+// (Ra0 * (16 - x0) + Ra1 * x0) * alpha) (when has_alpha is true)
+// otherwise
+// (Aa2 * (16 - x1) + Aa3 * x1, ... , Ra0 * (16 - x0) + Ra1 * x0)
+// In both cases, the results are renormalized (divided by 16) to match the
+// expected formats when storing back the results into memory.
+template<bool has_alpha>
+inline __m128i ProcessPixelPairZeroSubY(uint32_t pixel0,
+                                        uint32_t pixel1,
+                                        uint32_t pixel2,
+                                        uint32_t pixel3,
+                                        const __m128i& scale_x,
+                                        const __m128i& alpha) {
+    __m128i sum = ProcessPixelPairHelper(pixel0, pixel1, pixel2, pixel3,
+                                         scale_x);
+    return ScaleFourPixels<has_alpha, 4>(&sum, alpha);
+}
+
+// Same as ProcessPixelPairZeroSubY, except processing one output pixel at a
+// time instead of two. As in the above function, only two pixels are needed
+// to generate a single pixel since sub_y == 0.
+// @return same as ProcessPixelPairZeroSubY, except that only the bottom 4
+// 16 bit components are set.
+template<bool has_alpha>
+inline __m128i ProcessOnePixelZeroSubY(uint32_t pixel0,
+                                       uint32_t pixel1,
+                                       __m128i scale_x,
+                                       __m128i alpha) {
+    __m128i a0 = _mm_cvtsi32_si128(pixel0);
+    __m128i a1 = _mm_cvtsi32_si128(pixel1);
+
+    // Interleave
+    a0 = _mm_unpacklo_epi8(a0, a1);
+
+    // (a0 * (16-x) + a1 * x)
+    __m128i sum = _mm_maddubs_epi16(a0, scale_x);
+
+    return ScaleFourPixels<has_alpha, 4>(&sum, alpha);
+}
+
+// Methods when sub_y != 0
+
+
+// Same as ProcessPixelPairHelper, except that the values are scaled by y.
+// @param y vector of 16 bit components containing 'y' values. There are two
+//        cases in practice, where y will contain the sub_y constant, or will
+//        contain the 16 - sub_y constant.
+// @return vector of 16 bit components containing:
+// (y * (Aa2 * (16 - x1) + Aa3 * x1), ... , y * (Ra0 * (16 - x0) + Ra1 * x0))
+inline __m128i ProcessPixelPair(uint32_t pixel0,
+                                uint32_t pixel1,
+                                uint32_t pixel2,
+                                uint32_t pixel3,
+                                const __m128i& scale_x,
+                                const __m128i& y) {
+    __m128i sum = ProcessPixelPairHelper(pixel0, pixel1, pixel2, pixel3,
+                                         scale_x);
+
+    // first row times 16-y or y depending on whether 'y' represents one or
+    // the other.
+    // Values will be up to 255 * 16 * 16 = 65280.
+    // (y * (Aa2 * (16 - x1) + Aa3 * x1), ... ,
+    //  y * (Ra0 * (16 - x0) + Ra1 * x0))
+    sum = _mm_mullo_epi16(sum, y);
+
+    return sum;
+}
+
+// Process two pixel pairs out of eight input pixels.
+// In other methods, the distinct pixels are passed one by one, but in this
+// case, the rows, and index offsets to the pixels into the row are passed
+// to generate the 8 pixels.
+// @param row0..1 top and bottom row where to find input pixels.
+// @param x0..1 offsets into the row for all eight input pixels.
+// @param all_y vector of 16 bit components containing the constant sub_y
+// @param neg_y vector of 16 bit components containing the constant 16 - sub_y
+// @param alpha vector of 16 bit components containing the alpha value to scale
+//        the results by, when has_alpha is true.
+// @return
+// (alpha * ((16-y) * (Aa2  * (16-x1) + Aa3  * x1) +
+//             y    * (Aa2' * (16-x1) + Aa3' * x1)),
+// ...
+//  alpha * ((16-y) * (Ra0  * (16-x0) + Ra1 * x0) +
+//             y    * (Ra0' * (16-x0) + Ra1' * x0))
+// With the factor alpha removed when has_alpha is false.
+// The values are scaled back to 16 bit components, but with only the bottom
+// 8 bits being set.
+template<bool has_alpha>
+inline __m128i ProcessTwoPixelPairs(const uint32_t* row0,
+                                    const uint32_t* row1,
+                                    const int* x0,
+                                    const int* x1,
+                                    const __m128i& scale_x,
+                                    const __m128i& all_y,
+                                    const __m128i& neg_y,
+                                    const __m128i& alpha) {
+    __m128i sum0 = ProcessPixelPair(
+        row0[x0[0]], row0[x1[0]], row0[x0[1]], row0[x1[1]],
+        scale_x, neg_y);
+    __m128i sum1 = ProcessPixelPair(
+        row1[x0[0]], row1[x1[0]], row1[x0[1]], row1[x1[1]],
+        scale_x, all_y);
+
+    // 2 samples fully summed.
+    // ((16-y) * (Aa2 * (16-x1) + Aa3 * x1) +
+    //  y * (Aa2' * (16-x1) + Aa3' * x1),
+    // ...
+    //  (16-y) * (Ra0 * (16 - x0) + Ra1 * x0)) +
+    //  y * (Ra0' * (16-x0) + Ra1' * x0))
+    // Each component, again can be at most 256 * 255 = 65280, so no overflow.
+    sum0 = _mm_add_epi16(sum0, sum1);
+
+    return ScaleFourPixels<has_alpha, 8>(&sum0, alpha);
+}
+
+
+// Same as ProcessPixelPair, except that performing the math one output pixel
+// at a time. This means that only the bottom four 16 bit components are set.
+inline __m128i ProcessOnePixel(uint32_t pixel0, uint32_t pixel1,
+                               const __m128i& scale_x, const __m128i& y) {
+    __m128i a0 = _mm_cvtsi32_si128(pixel0);
+    __m128i a1 = _mm_cvtsi32_si128(pixel1);
+
+    // Interleave
+    // (0, 0, 0, 0, 0, 0, 0, 0, Aa1, Aa0, Ba1, Ba0, Ga1, Ga0, Ra1, Ra0)
+    a0 = _mm_unpacklo_epi8(a0, a1);
+
+    // (a0 * (16-x) + a1 * x)
+    a0 = _mm_maddubs_epi16(a0, scale_x);
+
+    // scale row by y
+    return _mm_mullo_epi16(a0, y);
+}
+
+// Notes about the various tricks that are used in this implementation:
+// - specialization for sub_y == 0.
+// Statistically, 1/16th of the samples will have sub_y == 0. When this
+// happens, the math goes from:
+// (16 - x)*(16 - y)*a00 + x*(16 - y)*a01 + (16 - x)*y*a10 + x*y*a11
+// to:
+// 16*(16 - x)*a00 + 16*x*a01
+// much simpler. The simplification makes for an easy boost in performance.
+// - calculating 4 output pixels at a time.
+//  This allows loading the coefficients x0 and x1 and shuffling them to the
+// optimum location only once per loop, instead of twice per loop.
+// This also allows us to store the four pixels with a single store.
+// - Use of 2 special SSSE3 instructions (comparatively to the SSE2 instruction
+// version):
+// _mm_shuffle_epi8 : this allows us to spread the coefficients x[0-3] loaded
+// in 32 bit values to 8 bit values repeated four times.
+// _mm_maddubs_epi16 : this allows us to perform multiplications and additions
+// in one swoop of 8bit values storing the results in 16 bit values. This
+// instruction is actually crucial for the speed of the implementation since
+// as one can see in the SSE2 implementation, all inputs have to be used as
+// 16 bits because the results are 16 bits. This basically allows us to process
+// twice as many pixel components per iteration.
+//
+// As a result, this method behaves faster than the traditional SSE2. The actual
+// boost varies greatly on the underlying architecture.
+template<bool has_alpha>
+void S32_generic_D32_filter_DX_SSSE3(const SkBitmapProcState& s,
+                                     const uint32_t* xy,
+                                     int count, uint32_t* colors) {
+    SkASSERT(count > 0 && colors != NULL);
+    SkASSERT(s.fDoFilter);
+    SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
+    if (has_alpha) {
+        SkASSERT(s.fAlphaScale < 256);
+    } else {
+        SkASSERT(s.fAlphaScale == 256);
+    }
+
+    const uint8_t* src_addr =
+            static_cast<const uint8_t*>(s.fBitmap->getPixels());
+    const unsigned rb = s.fBitmap->rowBytes();
+    const uint32_t XY = *xy++;
+    const unsigned y0 = XY >> 14;
+    const uint32_t* row0 =
+            reinterpret_cast<const uint32_t*>(src_addr + (y0 >> 4) * rb);
+    const uint32_t* row1 =
+            reinterpret_cast<const uint32_t*>(src_addr + (XY & 0x3FFF) * rb);
+    const unsigned sub_y = y0 & 0xF;
+
+    // vector constants
+    const __m128i mask_dist_select = _mm_set_epi8(12, 12, 12, 12,
+                                                  8,  8,  8,  8,
+                                                  4,  4,  4,  4,
+                                                  0,  0,  0,  0);
+    const __m128i mask_3FFF = _mm_set1_epi32(0x3FFF);
+    const __m128i mask_000F = _mm_set1_epi32(0x000F);
+    const __m128i sixteen_8bit = _mm_set1_epi8(16);
+    // (0, 0, 0, 0, 0, 0, 0, 0)
+    const __m128i zero = _mm_setzero_si128();
+
+    __m128i alpha;
+    if (has_alpha)
+        // 8x(alpha)
+        alpha = _mm_set1_epi16(s.fAlphaScale);
+
+    if (sub_y == 0) {
+        // Unroll 4x, interleave bytes, use pmaddubsw (all_x is small)
+        while (count > 3) {
+            count -= 4;
+
+            int x0[4];
+            int x1[4];
+            __m128i all_x, sixteen_minus_x;
+            PrepareConstantsTwoPixelPairs(xy, mask_3FFF, mask_000F,
+                                          sixteen_8bit, mask_dist_select,
+                                          &all_x, &sixteen_minus_x, x0, x1);
+            xy += 4;
+
+            // First pair of pixel pairs.
+            // (4x(x1, 16-x1), 4x(x0, 16-x0))
+            __m128i scale_x;
+            scale_x = _mm_unpacklo_epi8(sixteen_minus_x, all_x);
+
+            __m128i sum0 = ProcessPixelPairZeroSubY<has_alpha>(
+                row0[x0[0]], row0[x1[0]], row0[x0[1]], row0[x1[1]],
+                scale_x, alpha);
+
+            // second pair of pixel pairs
+            // (4x(x3, 16-x3), 4x(x2, 16-x2))
+            scale_x = _mm_unpackhi_epi8(sixteen_minus_x, all_x);
+
+            __m128i sum1 = ProcessPixelPairZeroSubY<has_alpha>(
+                row0[x0[2]], row0[x1[2]], row0[x0[3]], row0[x1[3]],
+                scale_x, alpha);
+
+            // Pack the 16 bit values of both sums into 16 bytes (4 pixels).
+            sum0 = _mm_packus_epi16(sum0, sum1);
+
+            // Store all four output pixels at once.
+            _mm_storeu_si128(reinterpret_cast<__m128i *>(colors), sum0);
+
+            colors += 4;
+        }
+
+        // handle remainder
+        while (count-- > 0) {
+            uint32_t xx = *xy++;  // x0:14 | 4 | x1:14
+            unsigned x0 = xx >> 18;
+            unsigned x1 = xx & 0x3FFF;
+
+            // 16x(x)
+            const __m128i all_x = _mm_set1_epi8((xx >> 14) & 0x0F);
+
+            // (16x(16-x))
+            __m128i scale_x = _mm_sub_epi8(sixteen_8bit, all_x);
+
+            scale_x = _mm_unpacklo_epi8(scale_x, all_x);
+
+            __m128i sum = ProcessOnePixelZeroSubY<has_alpha>(
+                row0[x0], row0[x1],
+                scale_x, alpha);
+
+            // Pack lower 4 16 bit values of sum into lower 4 bytes.
+            sum = _mm_packus_epi16(sum, zero);
+
+            // Extract low int and store.
+            *colors++ = _mm_cvtsi128_si32(sum);
+        }
+    } else {  // more general case, y != 0
+        // 8x(16)
+        const __m128i sixteen_16bit = _mm_set1_epi16(16);
+
+        // 8x (y)
+        const __m128i all_y = _mm_set1_epi16(sub_y);
+
+        // 8x (16-y)
+        const __m128i neg_y = _mm_sub_epi16(sixteen_16bit, all_y);
+
+        // Unroll 4x, interleave bytes, use pmaddubsw (all_x is small)
+        while (count > 3) {
+            count -= 4;
+
+            int x0[4];
+            int x1[4];
+            __m128i all_x, sixteen_minus_x;
+            PrepareConstantsTwoPixelPairs(xy, mask_3FFF, mask_000F,
+                                          sixteen_8bit, mask_dist_select,
+                                          &all_x, &sixteen_minus_x, x0, x1);
+            xy += 4;
+
+            // First pair of pixel pairs
+            // (4x(x1, 16-x1), 4x(x0, 16-x0))
+            __m128i scale_x;
+            scale_x = _mm_unpacklo_epi8(sixteen_minus_x, all_x);
+
+            __m128i sum0 = ProcessTwoPixelPairs<has_alpha>(
+                row0, row1, x0, x1,
+                scale_x, all_y, neg_y, alpha);
+
+            // second pair of pixel pairs
+            // (4x(x3, 16-x3), 4x(x2, 16-x2))
+            scale_x = _mm_unpackhi_epi8(sixteen_minus_x, all_x);
+
+            __m128i sum1 = ProcessTwoPixelPairs<has_alpha>(
+                row0, row1, x0 + 2, x1 + 2,
+                scale_x, all_y, neg_y, alpha);
+
+            // Do the final packing of the two results
+
+            // Pack the 16 bit values of both sums into 16 bytes (4 pixels).
+            sum0 = _mm_packus_epi16(sum0, sum1);
+
+            // Store all four output pixels at once.
+            _mm_storeu_si128(reinterpret_cast<__m128i *>(colors), sum0);
+
+            colors += 4;
+        }
+
+        // Left over.
+        while (count-- > 0) {
+            const uint32_t xx = *xy++;  // x0:14 | 4 | x1:14
+            const unsigned x0 = xx >> 18;
+            const unsigned x1 = xx & 0x3FFF;
+
+            // 16x(x)
+            const __m128i all_x = _mm_set1_epi8((xx >> 14) & 0x0F);
+
+            // 16x (16-x)
+            __m128i scale_x = _mm_sub_epi8(sixteen_8bit, all_x);
+
+            // (8x (x, 16-x))
+            scale_x = _mm_unpacklo_epi8(scale_x, all_x);
+
+            // first row.
+            __m128i sum0 = ProcessOnePixel(row0[x0], row0[x1], scale_x, neg_y);
+            // second row.
+            __m128i sum1 = ProcessOnePixel(row1[x0], row1[x1], scale_x, all_y);
+
+            // Add both rows for full sample
+            sum0 = _mm_add_epi16(sum0, sum1);
+
+            sum0 = ScaleFourPixels<has_alpha, 8>(&sum0, alpha);
+
+            // Pack lower 4 16 bit values of sum into lower 4 bytes.
+            sum0 = _mm_packus_epi16(sum0, zero);
+
+            // Extract low int and store.
+            *colors++ = _mm_cvtsi128_si32(sum0);
+        }
+    }
+}
+}  // namespace
+
+void S32_opaque_D32_filter_DX_SSSE3(const SkBitmapProcState& s,
+                                    const uint32_t* xy,
+                                    int count, uint32_t* colors) {
+    S32_generic_D32_filter_DX_SSSE3<false>(s, xy, count, colors);
+}
+
+void S32_alpha_D32_filter_DX_SSSE3(const SkBitmapProcState& s,
+                                   const uint32_t* xy,
+                                   int count, uint32_t* colors) {
+    S32_generic_D32_filter_DX_SSSE3<true>(s, xy, count, colors);
+}
diff --git a/src/opts/SkBitmapProcState_opts_SSSE3.h b/src/opts/SkBitmapProcState_opts_SSSE3.h
new file mode 100644
index 0000000..d21e7e4
--- /dev/null
+++ b/src/opts/SkBitmapProcState_opts_SSSE3.h
@@ -0,0 +1,15 @@
+/*
+ * Copyright 2012 The Android Open Source Project
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkBitmapProcState.h"
+
+void S32_opaque_D32_filter_DX_SSSE3(const SkBitmapProcState& s,
+                                    const uint32_t* xy,
+                                    int count, uint32_t* colors);
+void S32_alpha_D32_filter_DX_SSSE3(const SkBitmapProcState& s,
+                                   const uint32_t* xy,
+                                   int count, uint32_t* colors);
diff --git a/src/opts/SkBlitRow_opts_SSE2.cpp b/src/opts/SkBlitRow_opts_SSE2.cpp
index f03468f..8e4dd1d 100644
--- a/src/opts/SkBlitRow_opts_SSE2.cpp
+++ b/src/opts/SkBlitRow_opts_SSE2.cpp
@@ -1,6 +1,5 @@
-
 /*
- * Copyright 2009 The Android Open Source Project
+ * Copyright 2012 The Android Open Source Project
  *
  * Use of this source code is governed by a BSD-style license that can be
  * found in the LICENSE file.
@@ -39,34 +38,54 @@
         const __m128i *s = reinterpret_cast<const __m128i*>(src);
         __m128i *d = reinterpret_cast<__m128i*>(dst);
         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
-        __m128i src_scale_wide = _mm_set1_epi16(src_scale);
-        __m128i dst_scale_wide = _mm_set1_epi16(dst_scale);
+        __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);
+
+        // Move scale factors to upper byte of word
+        __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
+        __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);
         while (count >= 4) {
             // Load 4 pixels each of src and dest.
             __m128i src_pixel = _mm_loadu_si128(s);
             __m128i dst_pixel = _mm_load_si128(d);
 
+            // Interleave Atom port 0/1 operations based on the execution port
+            // constraints that multiply can only be executed on port 0 (while
+            // boolean operations can be executed on either port 0 or port 1)
+            // because GCC currently doesn't do a good job scheduling
+            // instructions based on these constraints.
+
             // Get red and blue pixels into lower byte of each word.
-            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
+            // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)
             __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
 
-            // Get alpha and green into lower byte of each word.
-            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
-            __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
+            // Multiply by scale.
+            // (4 x (0, rs.h, 0, bs.h))
+            // where rs.h stands for the higher byte of r * scale, and
+            // bs.h the higher byte of b * scale.
+            src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
+
+            // Get alpha and green pixels into higher byte of each word.
+            // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)
+            __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);
 
             // Multiply by scale.
-            src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
-            src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
-            dst_rb = _mm_mullo_epi16(dst_rb, dst_scale_wide);
-            dst_ag = _mm_mullo_epi16(dst_ag, dst_scale_wide);
+            // (4 x (as.h, as.l, gs.h, gs.l))
+            src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
 
-            // Divide by 256.
-            src_rb = _mm_srli_epi16(src_rb, 8);
-            dst_rb = _mm_srli_epi16(dst_rb, 8);
-            src_ag = _mm_andnot_si128(rb_mask, src_ag);
-            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
+            // Clear the lower byte of the a*scale and g*scale results
+            // (4 x (as.h, 0, gs.h, 0))
+            src_ag = _mm_and_si128(src_ag, ag_mask);
+
+            // Operations on the destination pixels are the same as on the
+            // source pixels. See the comments above.
+            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
+            dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);
+            __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);
+            dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);
+            dst_ag = _mm_and_si128(dst_ag, ag_mask);
 
             // Combine back into RGBA.
+            // (4 x (as.h, rs.h, gs.h, bs.h))
             src_pixel = _mm_or_si128(src_rb, src_ag);
             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
 
@@ -234,7 +253,7 @@
 
         const __m128i *s = reinterpret_cast<const __m128i*>(src);
         __m128i *d = reinterpret_cast<__m128i*>(dst);
-        __m128i src_scale_wide = _mm_set1_epi16(src_scale);
+        __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
         __m128i c_256 = _mm_set1_epi16(256);  // 8 copies of 256 (16-bit)
         while (count >= 4) {
@@ -251,14 +270,17 @@
             __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
 
             // Put per-pixel alpha in low byte of each word.
+            // After the following two statements, the dst_alpha looks like
+            // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
             __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
             dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
 
             // dst_alpha = dst_alpha * src_scale
-            dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
-
-            // Divide by 256.
-            dst_alpha = _mm_srli_epi16(dst_alpha, 8);
+            // Because src_scales are in the higher byte of each word and
+            // we use mulhi here, the resulting alpha values are already
+            // in the right place and don't need to be divided by 256.
+            // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
+            dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);
 
             // Subtract alphas from 256, to get 1..256
             dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
@@ -269,17 +291,25 @@
             dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
 
             // Multiply red and blue by global alpha.
-            src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
+            // (4 x (0, rs.h, 0, bs.h))
+            // where rs.h stands for the higher byte of r * src_scale,
+            // and bs.h the higher byte of b * src_scale.
+            // Again, because we use mulhi, the resulting red and blue
+            // values are already in the right place and don't need to
+            // be divided by 256.
+            src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
             // Multiply alpha and green by global alpha.
-            src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
+            // (4 x (0, as.h, 0, gs.h))
+            src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
 
             // Divide by 256.
             dst_rb = _mm_srli_epi16(dst_rb, 8);
-            src_rb = _mm_srli_epi16(src_rb, 8);
 
             // Mask out low bits (goodies already in the right place; no need to divide)
             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
-            src_ag = _mm_andnot_si128(rb_mask, src_ag);
+            // Shift alpha and green to higher byte of each word.
+            // (4 x (as.h, 0, gs.h, 0))
+            src_ag = _mm_slli_epi16(src_ag, 8);
 
             // Combine back into RGBA.
             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
@@ -386,8 +416,7 @@
 
 void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
                                size_t maskRB, SkColor origColor,
-                               int width, int height)
-{
+                               int width, int height) {
     SkPMColor color = SkPreMultiplyColor(origColor);
     size_t dstOffset = dstRB - (width << 2);
     size_t maskOffset = maskRB - width;
@@ -482,3 +511,226 @@
         mask += maskOffset;
     } while (--height != 0);
 }
+
+static __m128i SkBlendLCD16_SSE2(__m128i &srci, __m128i &dst,
+                                 __m128i &mask, __m128i &scale) {
+    // Isolate a 5-bit R, G and B value from each 16-bit mask pixel.
+    __m128i r = _mm_and_si128(_mm_slli_epi32(mask,
+                              16-SK_R16_SHIFT-(SK_R16_BITS-5)),
+                              _mm_set1_epi32(0x001F0000));  // r: bits 16..20
+
+    __m128i g = _mm_and_si128(_mm_slli_epi32(mask,
+                              8-SK_G16_SHIFT-(SK_G16_BITS-5)),
+                              _mm_set1_epi32(0x00001F00));  // g: bits 8..12
+
+    __m128i b = _mm_and_si128(_mm_slli_epi32(mask,
+                              SK_B16_BITS-5),
+                              _mm_set1_epi32(0x0000001F));  // b: bits 0..4
+            
+    // Merge: each 32-bit lane now holds one mask pixel as bytes (0, r, g, b).
+    mask = _mm_or_si128(_mm_or_si128(r, g), b);
+
+    // Zero-extend every byte to a 16-bit word; Lo and Hi hold 2 pixels each.
+    __m128i maskLo, maskHi;
+    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
+    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
+
+    // Upscale each 5-bit value from 0..31 to 0..32 (v + (v >> 4)).
+    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
+    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
+
+    maskLo = _mm_mullo_epi16(maskLo, scale);  // weight the mask by |scale|
+    maskHi = _mm_mullo_epi16(maskHi, scale);
+
+    maskLo = _mm_srli_epi16(maskLo, 8);       // ... then divide by 256
+    maskHi = _mm_srli_epi16(maskHi, 8);
+
+    // Zero-extend the dst bytes to 16-bit words, as was done for the mask.
+    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
+    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
+
+    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo));
+    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi));
+
+    maskLo = _mm_srai_epi16(maskLo, 5);  // signed: (src - dst) may be negative
+    maskHi = _mm_srai_epi16(maskHi, 5);
+
+    // result = dst + mask * (src - dst) / 32, for every 16-bit channel.
+    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
+    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
+
+    // Saturate the words to bytes and repack as 4 32-bit dst pixels.
+    return _mm_packus_epi16(resultLo, resultHi);
+}
+
+static __m128i SkBlendLCD16Opaque_SSE2(__m128i &srci, __m128i &dst, 
+                                       __m128i &mask) {
+    // Isolate a 5-bit R, G and B value from each 16-bit mask pixel.
+    __m128i r = _mm_and_si128(_mm_slli_epi32(mask,
+                              16-SK_R16_SHIFT-(SK_R16_BITS-5)),
+                              _mm_set1_epi32(0x001F0000));  // r: bits 16..20
+
+    __m128i g = _mm_and_si128(_mm_slli_epi32(mask,
+                              8-SK_G16_SHIFT-(SK_G16_BITS-5)),
+                              _mm_set1_epi32(0x00001F00));  // g: bits 8..12
+
+    __m128i b = _mm_and_si128(_mm_slli_epi32(mask, SK_B16_BITS-5),
+                              _mm_set1_epi32(0x0000001F));  // b: bits 0..4
+            
+    // Merge: each 32-bit lane now holds one mask pixel as bytes (0, r, g, b).
+    mask = _mm_or_si128(_mm_or_si128(r, g), b);
+
+    // Zero-extend every byte to a 16-bit word; Lo and Hi hold 2 pixels each.
+    __m128i maskLo, maskHi;
+    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
+    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
+
+    // Upscale each 5-bit value from 0..31 to 0..32 (v + (v >> 4)).
+    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
+    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
+
+    // Zero-extend the dst bytes to 16-bit words, as was done for the mask.
+    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
+    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
+
+    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(srci, dstLo));
+    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(srci, dstHi));
+
+    maskLo = _mm_srai_epi16(maskLo, 5);  // signed: (src - dst) may be negative
+    maskHi = _mm_srai_epi16(maskHi, 5);
+
+    // result = dst + mask * (src - dst) / 32, for every 16-bit channel.
+    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
+    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
+
+    // Saturate the words to bytes and repack as 4 32-bit dst pixels.
+    return _mm_packus_epi16(resultLo, resultHi);
+}
+
+void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t src[],
+                         SkColor color, int width, SkPMColor) {
+    if (width <= 0) {
+        return;
+    }
+
+    int srcA = SkColorGetA(color);
+    int srcR = SkColorGetR(color);
+    int srcG = SkColorGetG(color);
+    int srcB = SkColorGetB(color);
+    
+    srcA = SkAlpha255To256(srcA);
+
+    if (width >= 4) {
+        SkASSERT(((size_t)dst & 0x03) == 0);  // dst must be 4-byte aligned
+        while (((size_t)dst & 0x0F) != 0) {  // scalar until 16-byte aligned
+            *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src);
+            src++;
+            dst++;
+            width--;
+        }
+
+        __m128i *d = reinterpret_cast<__m128i*>(dst);
+        __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
+        srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128());  // bytes -> words
+        __m128i scale = _mm_set1_epi16(srcA);
+        while (width >= 4) {
+            __m128i dst_pixel = _mm_load_si128(d);
+            __m128i mask_pixel = _mm_loadl_epi64(
+                                     reinterpret_cast<const __m128i*>(src));
+
+            // Compare the 4 mask pixels with 0 and gather the top bit of every
+            // result byte; the upper half, loaded as zero, always matches, so
+            // pack_cmp is 0xFFFF exactly when all 4 mask pixels are zero.
+            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel,
+                                             _mm_setzero_si128()));
+
+            // Blend (and store) only if at least one mask pixel is non-zero.
+            if (pack_cmp != 0xFFFF) {
+                // Zero-extend the 4 16-bit mask pixels to 32 bits each:
+                // (p0, 0, p1, 0, p2, 0, p3, 0)
+                mask_pixel = _mm_unpacklo_epi16(mask_pixel,
+                                                _mm_setzero_si128());
+
+                // Blend 4 32-bit dst pixels at once.
+                __m128i result = SkBlendLCD16_SSE2(srci, dst_pixel,
+                                                   mask_pixel, scale); 
+                _mm_store_si128(d, result);
+            }
+
+            d++;
+            src += 4;
+            width -= 4;
+        }
+
+        dst = reinterpret_cast<SkPMColor*>(d);
+    }
+
+    while (width > 0) {  // scalar path for whatever is left
+        *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src);
+        src++;
+        dst++;
+        width--;        
+    }
+}
+
+void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t src[],
+                               SkColor color, int width, SkPMColor opaqueDst) {
+    if (width <= 0) {
+        return;
+    }
+
+    int srcR = SkColorGetR(color);
+    int srcG = SkColorGetG(color);
+    int srcB = SkColorGetB(color);
+
+    if (width >= 4) {
+        SkASSERT(((size_t)dst & 0x03) == 0);  // dst must be 4-byte aligned
+        while (((size_t)dst & 0x0F) != 0) {  // scalar until 16-byte aligned
+            *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst);
+            src++;
+            dst++;
+            width--;
+        }
+
+        __m128i *d = reinterpret_cast<__m128i*>(dst);
+        __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
+        srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128());  // bytes -> words
+        while (width >= 4) {
+            __m128i dst_pixel = _mm_load_si128(d);
+            __m128i mask_pixel = _mm_loadl_epi64(
+                                     reinterpret_cast<const __m128i*>(src));
+
+            // Compare the 4 mask pixels with 0 and gather the top bit of every
+            // result byte; the upper half, loaded as zero, always matches, so
+            // pack_cmp is 0xFFFF exactly when all 4 mask pixels are zero.
+            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel,
+                                             _mm_setzero_si128()));
+
+            // Blend (and store) only if at least one mask pixel is non-zero.
+            if (pack_cmp != 0xFFFF) {
+                // Zero-extend the 4 16-bit mask pixels to 32 bits each:
+                // (p0, 0, p1, 0, p2, 0, p3, 0)
+                mask_pixel = _mm_unpacklo_epi16(mask_pixel,
+                                                _mm_setzero_si128());
+
+                // Blend 4 32-bit dst pixels at once.
+                __m128i result = SkBlendLCD16Opaque_SSE2(srci, dst_pixel,
+                                                         mask_pixel); 
+                _mm_store_si128(d, result);
+            }
+
+            d++;
+            src += 4;
+            width -= 4;
+        }
+
+        dst = reinterpret_cast<SkPMColor*>(d);
+    }
+
+    while (width > 0) {  // scalar path for whatever is left
+        *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst);
+        src++;
+        dst++;
+        width--;        
+    }
+}
diff --git a/src/opts/SkBlitRow_opts_SSE2.h b/src/opts/SkBlitRow_opts_SSE2.h
index 8493e7a..b443ec7 100644
--- a/src/opts/SkBlitRow_opts_SSE2.h
+++ b/src/opts/SkBlitRow_opts_SSE2.h
@@ -23,3 +23,8 @@
 void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* mask,
                                size_t maskRB, SkColor color,
                                int width, int height);
+
+void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t src[],
+                         SkColor color, int width, SkPMColor);
+void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t src[],
+                               SkColor color, int width, SkPMColor opaqueDst);
diff --git a/src/opts/SkBlitRow_opts_arm.cpp b/src/opts/SkBlitRow_opts_arm.cpp
index 6d1ce79..20a82c8 100644
--- a/src/opts/SkBlitRow_opts_arm.cpp
+++ b/src/opts/SkBlitRow_opts_arm.cpp
@@ -1313,6 +1313,10 @@
     return NULL;
 }
 
+SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) {
+    return NULL;
+}
+
 SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkBitmap::Config dstConfig,
                                                  SkMask::Format maskFormat,
                                                  RowFlags flags) {
diff --git a/src/opts/SkBlitRow_opts_none.cpp b/src/opts/SkBlitRow_opts_none.cpp
index d58d2ea..5f4598e 100644
--- a/src/opts/SkBlitRow_opts_none.cpp
+++ b/src/opts/SkBlitRow_opts_none.cpp
@@ -31,7 +31,11 @@
 SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkBitmap::Config dstConfig,
                                                      SkMask::Format maskFormat,
                                                      SkColor color) {
-   return NULL;
+    return NULL;
+}
+
+SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) {
+    return NULL;
 }
 
 SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkBitmap::Config dstConfig,
diff --git a/src/opts/memset.arm.S b/src/opts/memset.arm.S
index bc0c060..1248631 100644
--- a/src/opts/memset.arm.S
+++ b/src/opts/memset.arm.S
@@ -32,6 +32,10 @@
         .fnstart
         push        {lr}
 
+        /* abort if count <= 0; cmp (unlike teq) sets V, which ble tests */
+        cmp         r2, #0
+        ble         .Lfinish
+
         /* Multiply count by 2 - go from the number of 16-bit shorts
          * to the number of bytes desired. */
         mov         r2, r2, lsl #1
@@ -52,6 +56,10 @@
         .fnstart
         push        {lr}
 
+        /* abort if count <= 0; cmp (unlike teq) sets V, which ble tests */
+        cmp         r2, #0
+        ble         .Lfinish
+
         /* Multiply count by 4 - go from the number of 32-bit words to
          * the number of bytes desired. */
         mov         r2, r2, lsl #2
@@ -97,5 +105,6 @@
         strcs       r1, [r0], #4
         strmih      lr, [r0], #2
 
+.Lfinish:
         pop         {pc}
         .fnend
diff --git a/src/opts/opts_check_SSE2.cpp b/src/opts/opts_check_SSE2.cpp
index 00497c9..be1b4a1 100644
--- a/src/opts/opts_check_SSE2.cpp
+++ b/src/opts/opts_check_SSE2.cpp
@@ -6,6 +6,7 @@
  */
 
 #include "SkBitmapProcState_opts_SSE2.h"
+#include "SkBitmapProcState_opts_SSSE3.h"
 #include "SkBlitMask.h"
 #include "SkBlitRow_opts_SSE2.h"
 #include "SkUtils_opts_SSE2.h"
@@ -16,12 +17,7 @@
    instruction on Pentium3 on the code below).  Only files named *_SSE2.cpp
    in this directory should be compiled with -msse2. */
 
-#if defined(__x86_64__) || defined(_WIN64)
-/* All x86_64 machines have SSE2, so don't even bother checking. */
-static inline bool hasSSE2() {
-    return true;
-}
-#else
+
 #ifdef _MSC_VER
 static inline void getcpuid(int info_type, int info[4]) {
     __asm {
@@ -35,6 +31,15 @@
     }
 }
 #else
+#if defined(__x86_64__)
+static inline void getcpuid(int info_type, int info[4]) {
+    asm volatile (
+        "cpuid \n\t"  // EAX = info_type on entry; ECX is not set as an input
+        : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
+        : "a"(info_type)
+    );  // info[0..3] receive EAX, EBX, ECX, EDX in that order
+}
+#else
 static inline void getcpuid(int info_type, int info[4]) {
     // We save and restore ebx, so this code can be compatible with -fPIC
     asm volatile (
@@ -47,6 +52,14 @@
     );
 }
 #endif
+#endif
+
+#if defined(__x86_64__) || defined(_WIN64)
+/* All x86_64 machines have SSE2, so don't even bother checking. */
+static inline bool hasSSE2() {
+    return true;
+}
+#else
 
 static inline bool hasSSE2() {
     int cpu_info[4] = { 0 };
@@ -55,19 +68,50 @@
 }
 #endif
 
+static inline bool hasSSSE3() {
+    int regs[4] = { 0 };
+    getcpuid(1, regs);  // query leaf 1; the third slot is tested below
+    return 0 != (regs[2] & (1 << 9));  // bit 9 (0x200) is the SSSE3 flag
+}
+
 static bool cachedHasSSE2() {
     static bool gHasSSE2 = hasSSE2();
     return gHasSSE2;
 }
 
+static bool cachedHasSSSE3() {
+    static bool gHasSSSE3 = hasSSSE3();  // initialized on the first call only
+    return gHasSSSE3;
+}
+
 void SkBitmapProcState::platformProcs() {
-    if (cachedHasSSE2()) {
+    if (cachedHasSSSE3()) {
+        if (fSampleProc32 == S32_opaque_D32_filter_DX) {
+            fSampleProc32 = S32_opaque_D32_filter_DX_SSSE3;
+        } else if (fSampleProc32 == S32_alpha_D32_filter_DX) {
+            fSampleProc32 = S32_alpha_D32_filter_DX_SSSE3;
+        }
+    } else if (cachedHasSSE2()) {
         if (fSampleProc32 == S32_opaque_D32_filter_DX) {
             fSampleProc32 = S32_opaque_D32_filter_DX_SSE2;
         } else if (fSampleProc32 == S32_alpha_D32_filter_DX) {
             fSampleProc32 = S32_alpha_D32_filter_DX_SSE2;
         }
     }
+
+    if (cachedHasSSSE3() || cachedHasSSE2()) {
+        if (fMatrixProc == ClampX_ClampY_filter_scale) {
+            fMatrixProc = ClampX_ClampY_filter_scale_SSE2;
+        } else if (fMatrixProc == ClampX_ClampY_nofilter_scale) {
+            fMatrixProc = ClampX_ClampY_nofilter_scale_SSE2;
+        }
+
+        if (fMatrixProc == ClampX_ClampY_filter_affine) {
+            fMatrixProc = ClampX_ClampY_filter_affine_SSE2;
+        } else if (fMatrixProc == ClampX_ClampY_nofilter_affine) {
+            fMatrixProc = ClampX_ClampY_nofilter_affine_SSE2;
+        }
+    }
 }
 
 static SkBlitRow::Proc32 platform_32_procs[] = {
@@ -126,6 +170,18 @@
     return proc;
 }
 
+// Returns the SSE2 LCD16 row blitter matching |isOpaque|, or NULL when
+// cachedHasSSE2() reports that SSE2 is unavailable.
+SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) {
+    if (!cachedHasSSE2()) {
+        return NULL;
+    }
+    if (isOpaque) {
+        return SkBlitLCD16OpaqueRow_SSE2;
+    }
+    return SkBlitLCD16Row_SSE2;
+}
+
 SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkBitmap::Config dstConfig,
                                                  SkMask::Format maskFormat,
                                                  RowFlags flags) {
diff --git a/src/opts/opts_check_arm.cpp b/src/opts/opts_check_arm.cpp
index 49e3096..20ec8a1 100644
--- a/src/opts/opts_check_arm.cpp
+++ b/src/opts/opts_check_arm.cpp
@@ -28,8 +28,8 @@
 SkMemset16Proc SkMemset16GetPlatformProc() {
 #if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN)
     return memset16_neon;
-//#elif defined(SK_CPU_LENDIAN)
-//    return arm_memset16;
+#elif defined(SK_CPU_LENDIAN)
+    return arm_memset16;
 #else
     return NULL;
 #endif
diff --git a/src/pdf/SkPDFFont.cpp b/src/pdf/SkPDFFont.cpp
index 465fbe1..3aea4b8 100644
--- a/src/pdf/SkPDFFont.cpp
+++ b/src/pdf/SkPDFFont.cpp
@@ -797,9 +797,10 @@
 }
 
 // static
-SkMutex& SkPDFFont::CanonicalFontsMutex() {
-    // This initialization is only thread safe with gcc.
-    static SkMutex gCanonicalFontsMutex;
+SkBaseMutex& SkPDFFont::CanonicalFontsMutex() {
+    // This initialization is only thread safe with gcc, or when
+    // POD-style mutex initialization is used.
+    SK_DECLARE_STATIC_MUTEX(gCanonicalFontsMutex);
     return gCanonicalFontsMutex;
 }
 
diff --git a/src/pdf/SkPDFGraphicState.cpp b/src/pdf/SkPDFGraphicState.cpp
index ad3f57b..ec9b0e7 100644
--- a/src/pdf/SkPDFGraphicState.cpp
+++ b/src/pdf/SkPDFGraphicState.cpp
@@ -85,9 +85,10 @@
 }
 
 // static
-SkMutex& SkPDFGraphicState::CanonicalPaintsMutex() {
-    // This initialization is only thread safe with gcc.
-    static SkMutex gCanonicalPaintsMutex;
+SkBaseMutex& SkPDFGraphicState::CanonicalPaintsMutex() {
+    // This initialization is only thread safe with gcc or when
+    // POD-style mutex initialization is used.
+    SK_DECLARE_STATIC_MUTEX(gCanonicalPaintsMutex);
     return gCanonicalPaintsMutex;
 }
 
diff --git a/src/pdf/SkPDFShader.cpp b/src/pdf/SkPDFShader.cpp
index b6e0939..183a4ff 100644
--- a/src/pdf/SkPDFShader.cpp
+++ b/src/pdf/SkPDFShader.cpp
@@ -391,9 +391,10 @@
 }
 
 // static
-SkMutex& SkPDFShader::CanonicalShadersMutex() {
-    // This initialization is only thread safe with gcc.
-    static SkMutex gCanonicalShadersMutex;
+SkBaseMutex& SkPDFShader::CanonicalShadersMutex() {
+    // This initialization is only thread safe with gcc or when
+    // POD-style mutex initialization is used.
+    SK_DECLARE_STATIC_MUTEX(gCanonicalShadersMutex);
     return gCanonicalShadersMutex;
 }
 
diff --git a/src/pipe/SkGPipeWrite.cpp b/src/pipe/SkGPipeWrite.cpp
index 00d5d5f..ee6e2c7 100644
--- a/src/pipe/SkGPipeWrite.cpp
+++ b/src/pipe/SkGPipeWrite.cpp
@@ -112,14 +112,14 @@
                                   const SkPaint*);
     virtual void drawSprite(const SkBitmap&, int left, int top,
                             const SkPaint*);
-    virtual void drawText(const void* text, size_t byteLength, SkScalar x, 
+    virtual void drawText(const void* text, size_t byteLength, SkScalar x,
                           SkScalar y, const SkPaint&);
-    virtual void drawPosText(const void* text, size_t byteLength, 
+    virtual void drawPosText(const void* text, size_t byteLength,
                              const SkPoint pos[], const SkPaint&);
     virtual void drawPosTextH(const void* text, size_t byteLength,
                       const SkScalar xpos[], SkScalar constY, const SkPaint&);
-    virtual void drawTextOnPath(const void* text, size_t byteLength, 
-                            const SkPath& path, const SkMatrix* matrix, 
+    virtual void drawTextOnPath(const void* text, size_t byteLength,
+                            const SkPath& path, const SkMatrix* matrix,
                                 const SkPaint&);
     virtual void drawPicture(SkPicture& picture);
     virtual void drawVertices(VertexMode, int vertexCount,
@@ -144,7 +144,7 @@
     inline void writeOp(DrawOps op, unsigned flags, unsigned data) {
         fWriter.write32(DrawOp_packOpFlagData(op, flags, data));
     }
-    
+
     inline void writeOp(DrawOps op) {
         fWriter.write32(DrawOp_packOpFlagData(op, 0, 0));
     }
@@ -164,7 +164,7 @@
         uint32_t    fSize;
 
         void*       data() { return (char*)this + sizeof(*this); }
-        
+
         static int Compare(const FlatData* a, const FlatData* b) {
             return memcmp(&a->fSize, &b->fSize, a->fSize + sizeof(a->fSize));
         }
@@ -307,7 +307,7 @@
     NOTIFY_SETUP(this);
     size_t size = 0;
     unsigned opFlags = 0;
-    
+
     if (bounds) {
         opFlags |= kSaveLayer_HasBounds_DrawOpFlag;
         size += sizeof(SkRect);
@@ -323,7 +323,7 @@
             fWriter.writeRect(*bounds);
         }
     }
-    
+
     // we just pass on the save, so we don't create a layer
     return this->INHERITED::save(saveFlags);
 }
@@ -506,7 +506,7 @@
     UNIMPLEMENTED
 }
 
-void SkGPipeCanvas::drawText(const void* text, size_t byteLength, SkScalar x, 
+void SkGPipeCanvas::drawText(const void* text, size_t byteLength, SkScalar x,
                                  SkScalar y, const SkPaint& paint) {
     if (byteLength) {
         NOTIFY_SETUP(this);
@@ -521,7 +521,7 @@
     }
 }
 
-void SkGPipeCanvas::drawPosText(const void* text, size_t byteLength, 
+void SkGPipeCanvas::drawPosText(const void* text, size_t byteLength,
                                 const SkPoint pos[], const SkPaint& paint) {
     if (byteLength) {
         NOTIFY_SETUP(this);
@@ -555,8 +555,8 @@
     }
 }
 
-void SkGPipeCanvas::drawTextOnPath(const void* text, size_t byteLength, 
-                                   const SkPath& path, const SkMatrix* matrix, 
+void SkGPipeCanvas::drawTextOnPath(const void* text, size_t byteLength,
+                                   const SkPath& path, const SkMatrix* matrix,
                                    const SkPaint& paint) {
     if (byteLength) {
         NOTIFY_SETUP(this);
@@ -611,7 +611,7 @@
         flags |= kDrawVertices_HasIndices_DrawOpFlag;
         size += 4 + SkAlign4(indexCount * sizeof(uint16_t));
     }
-    
+
     if (this->needOpBytes(size)) {
         this->writeOp(kDrawVertices_DrawOp, flags, 0);
         fWriter.write32(mode);
diff --git a/src/ports/SkFontHost_FONTPATH.cpp b/src/ports/SkFontHost_FONTPATH.cpp
index f0438f4..bd8f102 100644
--- a/src/ports/SkFontHost_FONTPATH.cpp
+++ b/src/ports/SkFontHost_FONTPATH.cpp
@@ -279,10 +279,6 @@
     return NULL;
 }
 
-bool SkFontHost::ValidFontID(uint32_t fontID) {
-    return get_id(*get_default_face()) == fontID;
-}
-
 SkStream* SkFontHost::OpenStream(uint32_t fontID) {
     sk_throw();  // not implemented
     return NULL;
diff --git a/src/ports/SkFontHost_FreeType.cpp b/src/ports/SkFontHost_FreeType.cpp
index 61efb95..da1040d 100644
--- a/src/ports/SkFontHost_FreeType.cpp
+++ b/src/ports/SkFontHost_FreeType.cpp
@@ -54,6 +54,15 @@
 //#define ENABLE_GLYPH_SPEW     // for tracing calls
 //#define DUMP_STRIKE_CREATION
 
+//#define SK_GAMMA_APPLY_TO_A8
+
+#ifndef SK_GAMMA_CONTRAST
+    #define SK_GAMMA_CONTRAST   0x66
+#endif
+#ifndef SK_GAMMA_EXPONENT
+    #define SK_GAMMA_EXPONENT   2.2
+#endif
+
 #ifdef SK_DEBUG
     #define SkASSERT_CONTINUE(pred)                                                         \
         do {                                                                                \
@@ -66,14 +75,6 @@
 
 using namespace skia_advanced_typeface_metrics_utils;
 
-// SK_FREETYPE_LCD_LERP should be 0...256
-//   0 means no color reduction (e.g. just as returned from FreeType)
-//   256 means 100% color reduction (e.g. gray)
-//
-#ifndef SK_FREETYPE_LCD_LERP
-    #define SK_FREETYPE_LCD_LERP    96
-#endif
-
 static bool isLCD(const SkScalerContext::Rec& rec) {
     switch (rec.fMaskFormat) {
         case SkMask::kLCD16_Format:
@@ -88,12 +89,13 @@
 
 struct SkFaceRec;
 
-static SkMutex      gFTMutex;
+SK_DECLARE_STATIC_MUTEX(gFTMutex);
 static int          gFTCount;
 static FT_Library   gFTLibrary;
 static SkFaceRec*   gFaceRecHead;
 static bool         gLCDSupportValid;  // true iff |gLCDSupport| has been set.
 static bool         gLCDSupport;  // true iff LCD is supported by the runtime.
+static int          gLCDExtra;  // number of extra pixels for filtering.
 
 static const uint8_t* gGammaTables[2];
 
@@ -113,8 +115,12 @@
     // Setup LCD filtering. This reduces colour fringes for LCD rendered
     // glyphs.
 #ifdef FT_LCD_FILTER_H
-    err = FT_Library_SetLcdFilter(gFTLibrary, FT_LCD_FILTER_DEFAULT);
+//    err = FT_Library_SetLcdFilter(gFTLibrary, FT_LCD_FILTER_DEFAULT);
+    err = FT_Library_SetLcdFilter(gFTLibrary, FT_LCD_FILTER_LIGHT);
     gLCDSupport = err == 0;
+    if (gLCDSupport) {
+        gLCDExtra = 2; //DEFAULT and LIGHT add one pixel to each side.
+    }
 #else
     gLCDSupport = false;
 #endif
@@ -630,11 +636,13 @@
     if (SkPaint::kFull_Hinting == h && !isLCD(*rec)) {
         // collapse full->normal hinting if we're not doing LCD
         h = SkPaint::kNormal_Hinting;
-    } else if ((rec->fFlags & SkScalerContext::kSubpixelPositioning_Flag) &&
-               SkPaint::kNo_Hinting != h) {
-        // to do subpixel, we must have at most slight hinting
-        h = SkPaint::kSlight_Hinting;
     }
+    if ((rec->fFlags & SkScalerContext::kSubpixelPositioning_Flag) || isLCD(*rec)) {
+        if (SkPaint::kNo_Hinting != h) {
+            h = SkPaint::kSlight_Hinting;
+        }
+    }
+
 #ifndef SK_IGNORE_ROTATED_FREETYPE_FIX
     // rotated text looks bad with hinting, so we disable it as needed
     if (!isAxisAligned(*rec)) {
@@ -643,6 +651,7 @@
 #endif
     rec->setHinting(h);
 
+#ifndef SK_USE_COLOR_LUMINANCE
     // for compatibility at the moment, discretize luminance to 3 settings
     // black, white, gray. This helps with fontcache utilization, since we
     // won't create multiple entries that in the end map to the same results.
@@ -662,6 +671,7 @@
         }
         rec->setLuminanceBits(lum);
     }
+#endif
 }
 
 #ifdef SK_BUILD_FOR_ANDROID
@@ -857,17 +867,6 @@
     this face with other context (at different sizes).
 */
 FT_Error SkScalerContext_FreeType::setupSize() {
-    /*  In the off-chance that a font has been removed, we want to error out
-        right away, so call resolve just to be sure.
-
-        TODO: perhaps we can skip this, by walking the global font cache and
-        killing all of the contexts when we know that a given fontID is going
-        away...
-     */
-    if (!SkFontHost::ValidFontID(fRec.fFontID)) {
-        return (FT_Error)-1;
-    }
-
     FT_Error    err = FT_Activate_Size(fFTSize);
 
     if (err != 0) {
@@ -1012,6 +1011,11 @@
         glyph->fHeight  = SkToU16((bbox.yMax - bbox.yMin) >> 6);
         glyph->fTop     = -SkToS16(bbox.yMax >> 6);
         glyph->fLeft    = SkToS16(bbox.xMin >> 6);
+
+        if (isLCD(fRec)) {
+            glyph->fWidth += gLCDExtra;
+            glyph->fLeft -= gLCDExtra >> 1;
+        }
         break;
       }
 
@@ -1049,19 +1053,84 @@
 #endif
 }
 
-static int lerp(int start, int end) {
-    SkASSERT((unsigned)SK_FREETYPE_LCD_LERP <= 256);
-    return start + ((end - start) * (SK_FREETYPE_LCD_LERP) >> 8);
+///////////////////////////////////////////////////////////////////////////////
+
+static int apply_contrast(int srca, int contrast) {
+    return srca + (((255 - srca) * contrast * srca) / (255*255));
 }
 
-static uint16_t packTriple(unsigned r, unsigned g, unsigned b) {
-    if (SK_FREETYPE_LCD_LERP) {
-        // want (a+b+c)/3, but we approx to avoid the divide
-        unsigned ave = (5 * (r + g + b) + b) >> 4;
-        r = lerp(r, ave);
-        g = lerp(g, ave);
-        b = lerp(b, ave);
+static void build_power_table(uint8_t table[], float ee) {
+    for (int i = 0; i < 256; i++) {
+        float x = i / 255.f;
+        x = powf(x, ee);
+        int xx = SkScalarRoundToInt(SkFloatToScalar(x * 255));
+        table[i] = SkToU8(xx);
     }
+}
+
+static void build_gamma_table(uint8_t table[256], int src, int dst) {
+    static bool gInit;  // guards the one-time, unsynchronized setup below
+    static uint8_t powTable[256], invPowTable[256];
+    if (!gInit) {
+        const float g = SK_GAMMA_EXPONENT;
+        build_power_table(powTable, g);       // x -> x^g
+        build_power_table(invPowTable, 1/g);  // x -> x^(1/g)
+        gInit = true;
+    }
+
+    const int linSrc = powTable[src];
+    const int linDst = powTable[dst];
+    // have our contrast value taper off to 0 as the src luminance becomes white
+    const int contrast = SK_GAMMA_CONTRAST * (255 - linSrc) / 255;
+    
+    for (int i = 0; i < 256; ++i) {
+        int srca = apply_contrast(i, contrast);
+        SkASSERT((unsigned)srca <= 255);
+        int dsta = 255 - srca;
+
+        // Blend src over dst using the x^g values, then map back via x^(1/g).
+        int linOut = (linSrc * srca + dsta * linDst) / 255;
+        SkASSERT((unsigned)linOut <= 255);
+        int out = invPowTable[linOut];
+
+        // Undo what the blit blend will do; divides by (src - dst), so src != dst.
+        int result = ((255 * out) - (255 * dst)) / (src - dst);
+        SkASSERT((unsigned)result <= 255);
+
+        table[i] = result;
+    }
+}
+
+static const uint8_t* getGammaTable(U8CPU luminance) {
+    static uint8_t gGammaTables[4][256];  // local; hides the file-level name
+    static bool gInited;                  // unsynchronized one-time setup
+    if (!gInited) {
+        build_gamma_table(gGammaTables[0], 0x00, 0xFF);
+        build_gamma_table(gGammaTables[1], 0x66, 0x99);
+        build_gamma_table(gGammaTables[2], 0x99, 0x66);
+        build_gamma_table(gGammaTables[3], 0xFF, 0x00);
+
+        gInited = true;
+    }
+    SkASSERT(0 == (luminance >> 8));  // luminance must fit in 8 bits
+    return gGammaTables[luminance >> 6];  // top 2 bits pick one of 4 tables
+}
+
+#ifndef SK_USE_COLOR_LUMINANCE
+static const uint8_t* getIdentityTable() {
+    static bool gOnce;  // unsynchronized one-time fill flag
+    static uint8_t gIdentityTable[256];
+    if (!gOnce) {
+        for (int i = 0; i < 256; ++i) {
+            gIdentityTable[i] = i;  // table[v] == v for every byte value
+        }
+        gOnce = true;
+    }
+    return gIdentityTable;
+}
+#endif
+
+static uint16_t packTriple(unsigned r, unsigned g, unsigned b) {
     return SkPackRGB16(r >> 3, g >> 2, b >> 3);
 }
 
@@ -1077,7 +1146,8 @@
 }
 
 static void copyFT2LCD16(const SkGlyph& glyph, const FT_Bitmap& bitmap,
-                         int lcdIsBGR) {
+                         int lcdIsBGR, const uint8_t* tableR,
+                         const uint8_t* tableG, const uint8_t* tableB) {
     SkASSERT(glyph.fHeight == bitmap.rows);
     uint16_t* dst = reinterpret_cast<uint16_t*>(glyph.fImage);
     const size_t dstRB = glyph.rowBytes();
@@ -1104,18 +1174,21 @@
             }
         } break;
         default: {
-            SkASSERT(glyph.fWidth * 3 == bitmap.width - 6);
-            src += 3;
+            SkASSERT(glyph.fWidth * 3 == bitmap.width);
             for (int y = 0; y < glyph.fHeight; y++) {
                 const uint8_t* triple = src;
                 if (lcdIsBGR) {
                     for (int x = 0; x < width; x++) {
-                        dst[x] = packTriple(triple[2], triple[1], triple[0]);
+                        dst[x] = packTriple(tableR[triple[2]], 
+                                            tableG[triple[1]],
+                                            tableB[triple[0]]);
                         triple += 3;
                     }
                 } else {
                     for (int x = 0; x < width; x++) {
-                        dst[x] = packTriple(triple[0], triple[1], triple[2]);
+                        dst[x] = packTriple(tableR[triple[0]], 
+                                            tableG[triple[1]],
+                                            tableB[triple[2]]);
                         triple += 3;
                     }
                 }
@@ -1144,6 +1217,26 @@
         return;
     }
 
+#ifdef SK_USE_COLOR_LUMINANCE
+    SkColor lumColor = fRec.getLuminanceColor();
+    const uint8_t* tableR = getGammaTable(SkColorGetR(lumColor));
+    const uint8_t* tableG = getGammaTable(SkColorGetG(lumColor));
+    const uint8_t* tableB = getGammaTable(SkColorGetB(lumColor));
+#else
+    unsigned lum = fRec.getLuminanceByte();
+    const uint8_t* tableR;
+    const uint8_t* tableG;
+    const uint8_t* tableB;
+
+    bool isWhite = lum >= WHITE_LUMINANCE_LIMIT;
+    bool isBlack = lum <= BLACK_LUMINANCE_LIMIT;
+    if ((gGammaTables[0] || gGammaTables[1]) && (isBlack || isWhite)) {
+        tableR = tableG = tableB = gGammaTables[isBlack ? 0 : 1];
+    } else {
+        tableR = tableG = tableB = getIdentityTable();
+    }
+#endif
+
     switch ( fFace->glyph->format ) {
         case FT_GLYPH_FORMAT_OUTLINE: {
             FT_Outline* outline = &fFace->glyph->outline;
@@ -1176,7 +1269,8 @@
             if (SkMask::kLCD16_Format == glyph.fMaskFormat) {
                 FT_Render_Glyph(fFace->glyph, FT_RENDER_MODE_LCD);
                 copyFT2LCD16(glyph, fFace->glyph->bitmap,
-                             fRec.fFlags & SkScalerContext::kLCD_BGROrder_Flag);
+                             fRec.fFlags & SkScalerContext::kLCD_BGROrder_Flag,
+                             tableR, tableG, tableB);
             } else {
                 target.width = glyph.fWidth;
                 target.rows = glyph.fHeight;
@@ -1242,7 +1336,8 @@
                 }
             } else if (SkMask::kLCD16_Format == glyph.fMaskFormat) {
                 copyFT2LCD16(glyph, fFace->glyph->bitmap,
-                             fRec.fFlags & SkScalerContext::kLCD_BGROrder_Flag);
+                             fRec.fFlags & SkScalerContext::kLCD_BGROrder_Flag,
+                             tableR, tableG, tableB);
             } else {
                 SkDEBUGFAIL("unknown glyph bitmap transform needed");
             }
@@ -1253,25 +1348,23 @@
         goto ERROR;
     }
 
-    if (gGammaTables[0] || gGammaTables[1]) {
-        bool isWhite = fRec.getLuminanceByte() >= WHITE_LUMINANCE_LIMIT;
-        bool isBlack = fRec.getLuminanceByte() <= BLACK_LUMINANCE_LIMIT;
-        if ((isWhite | isBlack) && SkMask::kA8_Format == glyph.fMaskFormat) {
-            int index = isBlack ? 0 : 1;
-            if (gGammaTables[index]) {
-                const uint8_t* SK_RESTRICT table = gGammaTables[index];
-                uint8_t* SK_RESTRICT dst = (uint8_t*)glyph.fImage;
-                unsigned rowBytes = glyph.rowBytes();
-                
-                for (int y = glyph.fHeight - 1; y >= 0; --y) {
-                    for (int x = glyph.fWidth - 1; x >= 0; --x) {
-                        dst[x] = table[dst[x]];
-                    }
-                    dst += rowBytes;
-                }
+// We used to always do this pre-USE_COLOR_LUMINANCE, but with colorlum,
+// it is optional
+#if defined(SK_GAMMA_APPLY_TO_A8) || !defined(SK_USE_COLOR_LUMINANCE)
+    if (SkMask::kA8_Format == glyph.fMaskFormat) {
+        SkASSERT(tableR == tableG && tableR == tableB);
+        const uint8_t* table = tableR;
+        uint8_t* SK_RESTRICT dst = (uint8_t*)glyph.fImage;
+        unsigned rowBytes = glyph.rowBytes();
+        
+        for (int y = glyph.fHeight - 1; y >= 0; --y) {
+            for (int x = glyph.fWidth - 1; x >= 0; --x) {
+                dst[x] = table[dst[x]];
             }
+            dst += rowBytes;
         }
     }
+#endif
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1422,7 +1515,7 @@
                 emboldenOutline(&fFace->glyph->outline);
             }
             FT_Outline_Get_CBox(&fFace->glyph->outline, &bbox);
-            x_height = SkIntToScalar(bbox.yMax) / 64;
+            x_height = SkFixedToScalar(SkFDot6ToFixed(bbox.yMax));
         } else {
             x_height = 0;
         }
@@ -1477,12 +1570,11 @@
 /*  Export this so that other parts of our FonttHost port can make use of our
     ability to extract the name+style from a stream, using FreeType's api.
 */
-SkTypeface::Style find_name_and_attributes(SkStream* stream, SkString* name,
-                                           bool* isFixedWidth) {
+bool find_name_and_attributes(SkStream* stream, SkString* name,
+                              SkTypeface::Style* style, bool* isFixedWidth) {
     FT_Library  library;
     if (FT_Init_FreeType(&library)) {
-        name->reset();
-        return SkTypeface::kNormal;
+        return false;
     }
 
     FT_Open_Args    args;
@@ -1509,18 +1601,22 @@
     FT_Face face;
     if (FT_Open_Face(library, &args, 0, &face)) {
         FT_Done_FreeType(library);
-        name->reset();
-        return SkTypeface::kNormal;
+        return false;
     }
 
-    name->set(face->family_name);
-    int style = SkTypeface::kNormal;
-
+    int tempStyle = SkTypeface::kNormal;
     if (face->style_flags & FT_STYLE_FLAG_BOLD) {
-        style |= SkTypeface::kBold;
+        tempStyle |= SkTypeface::kBold;
     }
     if (face->style_flags & FT_STYLE_FLAG_ITALIC) {
-        style |= SkTypeface::kItalic;
+        tempStyle |= SkTypeface::kItalic;
+    }
+
+    if (name) {
+        name->set(face->family_name);
+    }
+    if (style) {
+        *style = (SkTypeface::Style) tempStyle;
     }
     if (isFixedWidth) {
         *isFixedWidth = FT_IS_FIXED_WIDTH(face);
@@ -1528,5 +1624,5 @@
 
     FT_Done_Face(face);
     FT_Done_FreeType(library);
-    return (SkTypeface::Style)style;
+    return true;
 }
diff --git a/src/ports/SkFontHost_android.cpp b/src/ports/SkFontHost_android.cpp
index b07d93d..1856cff 100644
--- a/src/ports/SkFontHost_android.cpp
+++ b/src/ports/SkFontHost_android.cpp
@@ -32,8 +32,8 @@
     #define SK_FONT_FILE_PREFIX          "/fonts/"
 #endif
 
-SkTypeface::Style find_name_and_attributes(SkStream* stream, SkString* name,
-                                           bool* isFixedWidth);
+bool find_name_and_attributes(SkStream* stream, SkString* name,
+                              SkTypeface::Style* style, bool* isFixedWidth);
 
 static void GetFullPathForSysFonts(SkString* full, const char name[]) {
     full->set(getenv("ANDROID_ROOT"));
@@ -64,14 +64,27 @@
         // we don't own family, so just ignore our reference
     }
 };
+typedef SkTDArray<NameFamilyPair> NameFamilyPairList;
 
 // we use atomic_inc to grow this for each typeface we create
 static int32_t gUniqueFontID;
 
-// this is the mutex that protects these globals
-static SkMutex gFamilyMutex;
+// this is the mutex that protects gFamilyHead and GetNameList()
+SK_DECLARE_STATIC_MUTEX(gFamilyHeadAndNameListMutex);
 static FamilyRec* gFamilyHead;
-static SkTDArray<NameFamilyPair> gNameList;
+
+/*
+ *  Returns the function-static name list, allocating it on first use. The
+ *  caller must already hold gFamilyHeadAndNameListMutex; no lock is taken here.
+ */
+static NameFamilyPairList& GetNameList() {
+    static NameFamilyPairList* gList;
+    if (!gList) {
+        // register a delete proc with sk_atexit(..) when available
+        gList = SkNEW(NameFamilyPairList);
+    }
+    return *gList;
+}
 
 struct FamilyRec {
     FamilyRec*  fNext;
@@ -108,7 +121,7 @@
         }
     }
     // should never get here, since the faces list should not be empty
-    SkASSERT(!"faces list is empty");
+    SkDEBUGFAIL("faces list is empty");
     return NULL;
 }
 
@@ -184,9 +197,11 @@
     SkASSERT(!"Yikes, couldn't find family in our list to remove/delete");
 }
 
+//  gFamilyHeadAndNameListMutex must already be acquired
 static SkTypeface* find_typeface(const char name[], SkTypeface::Style style) {
-    NameFamilyPair* list = gNameList.begin();
-    int             count = gNameList.count();
+    NameFamilyPairList& namelist = GetNameList();
+    NameFamilyPair* list = namelist.begin();
+    int             count = namelist.count();
 
     int index = SkStrLCSearch(&list[0].fName, count, name, sizeof(list[0]));
 
@@ -196,36 +211,39 @@
     return NULL;
 }
 
+//  gFamilyHeadAndNameListMutex must already be acquired
 static SkTypeface* find_typeface(const SkTypeface* familyMember,
                                  SkTypeface::Style style) {
     const FamilyRec* family = find_family(familyMember);
     return family ? find_best_face(family, style) : NULL;
 }
 
+//  gFamilyHeadAndNameListMutex must already be acquired
 static void add_name(const char name[], FamilyRec* family) {
     SkAutoAsciiToLC tolc(name);
     name = tolc.lc();
 
-    NameFamilyPair* list = gNameList.begin();
-    int             count = gNameList.count();
+    NameFamilyPairList& namelist = GetNameList();
+    NameFamilyPair* list = namelist.begin();
+    int             count = namelist.count();
 
     int index = SkStrLCSearch(&list[0].fName, count, name, sizeof(list[0]));
 
     if (index < 0) {
-        list = gNameList.insert(~index);
+        list = namelist.insert(~index);
         list->construct(name, family);
     }
 }
 
-static void remove_from_names(FamilyRec* emptyFamily)
-{
+//  gFamilyHeadAndNameListMutex must already be acquired
+static void remove_from_names(FamilyRec* emptyFamily) {
 #ifdef SK_DEBUG
     for (int i = 0; i < 4; i++) {
         SkASSERT(emptyFamily->fFaces[i] == NULL);
     }
 #endif
 
-    SkTDArray<NameFamilyPair>& list = gNameList;
+    SkTDArray<NameFamilyPair>& list = GetNameList();
 
     // must go backwards when removing
     for (int i = list.count() - 1; i >= 0; --i) {
@@ -246,8 +264,8 @@
     : SkTypeface(style, sk_atomic_inc(&gUniqueFontID) + 1, isFixedWidth) {
         fIsSysFont = sysFont;
 
-        SkAutoMutexAcquire  ac(gFamilyMutex);
 
+        // our caller has acquired the gFamilyHeadAndNameListMutex so this is safe
         FamilyRec* rec = NULL;
         if (familyMember) {
             rec = find_family(familyMember);
@@ -259,7 +277,7 @@
     }
 
     virtual ~FamilyTypeface() {
-        SkAutoMutexAcquire  ac(gFamilyMutex);
+        SkAutoMutexAcquire  ac(gFamilyHeadAndNameListMutex);
 
         // remove us from our family. If the family is now empty, we return
         // that and then remove that family from the name list
@@ -373,14 +391,12 @@
 
     SkMMAPStream stream(fullpath.c_str());
     if (stream.getLength() > 0) {
-        *style = find_name_and_attributes(&stream, name, isFixedWidth);
-        return true;
+        return find_name_and_attributes(&stream, name, style, isFixedWidth);
     }
     else {
         SkFILEStream stream(fullpath.c_str());
         if (stream.getLength() > 0) {
-            *style = find_name_and_attributes(&stream, name, isFixedWidth);
-            return true;
+            return find_name_and_attributes(&stream, name, style, isFixedWidth);
         }
     }
 
@@ -473,8 +489,11 @@
     fontFamilies.deleteAll();
 }
 
-/*  Called once (ensured by the sentinel check at the beginning of our body).
-    Initializes all the globals, and register the system fonts.
+/*
+ *  Called once (ensured by the sentinel check at the beginning of our body).
+ *  Initializes all the globals, and register the system fonts.
+ *
+ *  gFamilyHeadAndNameListMutex must already be acquired.
  */
 static void load_system_fonts() {
     // check if we've already be called
@@ -588,6 +607,8 @@
 }
 
 SkTypeface* SkFontHost::Deserialize(SkStream* stream) {
+    SkAutoMutexAcquire  ac(gFamilyHeadAndNameListMutex);
+
     load_system_fonts();
 
     // check if the font is a custom or system font
@@ -642,9 +663,9 @@
                                        const char familyName[],
                                        const void* data, size_t bytelength,
                                        SkTypeface::Style style) {
-    load_system_fonts();
+    SkAutoMutexAcquire  ac(gFamilyHeadAndNameListMutex);
 
-    SkAutoMutexAcquire  ac(gFamilyMutex);
+    load_system_fonts();
 
     // clip to legal style bits
     style = (SkTypeface::Style)(style & SkTypeface::kBoldItalic);
@@ -667,14 +688,8 @@
     return tf;
 }
 
-bool SkFontHost::ValidFontID(uint32_t fontID) {
-    SkAutoMutexAcquire  ac(gFamilyMutex);
-
-    return find_from_uniqueID(fontID) != NULL;
-}
-
 SkStream* SkFontHost::OpenStream(uint32_t fontID) {
-    SkAutoMutexAcquire  ac(gFamilyMutex);
+    SkAutoMutexAcquire  ac(gFamilyHeadAndNameListMutex);
 
     FamilyTypeface* tf = (FamilyTypeface*)find_from_uniqueID(fontID);
     SkStream* stream = tf ? tf->openStream() : NULL;
@@ -688,7 +703,7 @@
 
 size_t SkFontHost::GetFileName(SkFontID fontID, char path[], size_t length,
                                int32_t* index) {
-    SkAutoMutexAcquire  ac(gFamilyMutex);
+    SkAutoMutexAcquire  ac(gFamilyHeadAndNameListMutex);
 
     FamilyTypeface* tf = (FamilyTypeface*)find_from_uniqueID(fontID);
     const char* src = tf ? tf->getFilePath() : NULL;
@@ -708,6 +723,8 @@
 }
 
 SkFontID SkFontHost::NextLogicalFont(SkFontID currFontID, SkFontID origFontID) {
+    SkAutoMutexAcquire  ac(gFamilyHeadAndNameListMutex);
+
     load_system_fonts();
 
     const SkTypeface* origTypeface = find_from_uniqueID(origFontID);
@@ -749,10 +766,10 @@
     }
 
     bool isFixedWidth;
-    SkString name;
-    SkTypeface::Style style = find_name_and_attributes(stream, &name, &isFixedWidth);
+    SkTypeface::Style style;
 
-    if (!name.isEmpty()) {
+    if (find_name_and_attributes(stream, NULL, &style, &isFixedWidth)) {
+        SkAutoMutexAcquire  ac(gFamilyHeadAndNameListMutex);
         return SkNEW_ARGS(StreamTypeface, (style, false, NULL, stream, isFixedWidth));
     } else {
         return NULL;
diff --git a/src/ports/SkFontHost_fontconfig.cpp b/src/ports/SkFontHost_fontconfig.cpp
index acc5ae0..775a7c7 100644
--- a/src/ports/SkFontHost_fontconfig.cpp
+++ b/src/ports/SkFontHost_fontconfig.cpp
@@ -38,7 +38,7 @@
 // Although truetype fonts can support multiple faces in a single file, at the
 // moment Skia doesn't.
 // -----------------------------------------------------------------------------
-static SkMutex global_fc_map_lock;
+SK_DECLARE_STATIC_MUTEX(global_fc_map_lock);
 static std::map<std::string, unsigned> global_fc_map;
 static std::map<unsigned, std::string> global_fc_map_inverted;
 static std::map<uint32_t, SkTypeface *> global_fc_typefaces;
@@ -304,12 +304,6 @@
 }
 
 // static
-bool SkFontHost::ValidFontID(SkFontID uniqueID) {
-    SkAutoMutexAcquire ac(global_fc_map_lock);
-    return global_fc_typefaces.find(uniqueID) != global_fc_typefaces.end();
-}
-
-// static
 SkStream* SkFontHost::OpenStream(uint32_t id)
 {
     SkAutoMutexAcquire ac(global_fc_map_lock);
diff --git a/src/ports/SkFontHost_freetype_mac.cpp b/src/ports/SkFontHost_freetype_mac.cpp
index 140098d..e51f802 100644
--- a/src/ports/SkFontHost_freetype_mac.cpp
+++ b/src/ports/SkFontHost_freetype_mac.cpp
@@ -65,10 +65,6 @@
     return create_from_path(path);
 }
 
-bool SkFontHost::ValidFontID(SkFontID fontID) {
-    return SkTypefaceCache::FindByID(fontID) != NULL;
-}
-
 SkStream* SkFontHost::OpenStream(uint32_t fontID) {
     FTMacTypeface* tf = (FTMacTypeface*)SkTypefaceCache::FindByID(fontID);
     if (tf) {
diff --git a/src/ports/SkFontHost_linux.cpp b/src/ports/SkFontHost_linux.cpp
index c87b036..be99576 100644
--- a/src/ports/SkFontHost_linux.cpp
+++ b/src/ports/SkFontHost_linux.cpp
@@ -22,8 +22,8 @@
     #define SK_FONT_FILE_PREFIX      "/usr/share/fonts/truetype/msttcorefonts/"
 #endif
 
-SkTypeface::Style find_name_and_attributes(SkStream* stream, SkString* name,
-                                           bool* isFixedWidth);
+bool find_name_and_attributes(SkStream* stream, SkString* name,
+                              SkTypeface::Style* style, bool* isFixedWidth);
 
 static void GetFullPathForSysFonts(SkString* full, const char name[])
 {
@@ -60,7 +60,7 @@
 static int32_t gUniqueFontID;
 
 // this is the mutex that protects these globals
-static SkMutex gFamilyMutex;
+SK_DECLARE_STATIC_MUTEX(gFamilyMutex);
 static FamilyRec* gFamilyHead;
 static SkTDArray<NameFamilyPair> gNameList;
 
@@ -358,14 +358,12 @@
                                SkTypeface::Style* style, bool* isFixedWidth) {    
     SkMMAPStream stream(path);
     if (stream.getLength() > 0) {
-        *style = find_name_and_attributes(&stream, name, isFixedWidth);
-        return true;
+        return find_name_and_attributes(&stream, name, style, isFixedWidth);
     }
     else {
         SkFILEStream stream(path);
         if (stream.getLength() > 0) {
-            *style = find_name_and_attributes(&stream, name, isFixedWidth);
-            return true;
+            return find_name_and_attributes(&stream, name, style, isFixedWidth);
         }
     }
     
@@ -545,12 +543,6 @@
     return tf;
 }
 
-bool SkFontHost::ValidFontID(uint32_t fontID) {
-    SkAutoMutexAcquire  ac(gFamilyMutex);
-    
-    return valid_uniqueID(fontID);
-}
-
 SkStream* SkFontHost::OpenStream(uint32_t fontID) {
     FamilyTypeface* tf = (FamilyTypeface*)find_from_uniqueID(fontID);
     SkStream* stream = tf ? tf->openStream() : NULL;
@@ -581,10 +573,12 @@
     }
 
     bool isFixedWidth;
-    SkString name;
-    SkTypeface::Style style = find_name_and_attributes(stream, &name, &isFixedWidth);
-    
-    return SkNEW_ARGS(StreamTypeface, (style, false, NULL, stream, isFixedWidth));
+    SkTypeface::Style style;
+    if (find_name_and_attributes(stream, NULL, &style, &isFixedWidth)) {
+        return SkNEW_ARGS(StreamTypeface, (style, false, NULL, stream, isFixedWidth));
+    } else {
+        return NULL;
+    }
 }
 
 SkTypeface* SkFontHost::CreateTypefaceFromFile(const char path[]) {
diff --git a/src/ports/SkFontHost_mac_atsui.cpp b/src/ports/SkFontHost_mac_atsui.cpp
index b90757c..ae32036 100644
--- a/src/ports/SkFontHost_mac_atsui.cpp
+++ b/src/ports/SkFontHost_mac_atsui.cpp
@@ -16,7 +16,7 @@
 #include "SkPoint.h"
 
 const char* gDefaultfont = "Arial"; // hard code for now
-static SkMutex      gFTMutex;
+SK_DECLARE_STATIC_MUTEX(gFTMutex);
 
 static inline SkPoint F32PtToSkPoint(const Float32Point p) {
     SkPoint sp = { SkFloatToScalar(p.x), SkFloatToScalar(p.y) };
diff --git a/src/ports/SkFontHost_mac_coretext.cpp b/src/ports/SkFontHost_mac_coretext.cpp
index 866e196..a612555 100644
--- a/src/ports/SkFontHost_mac_coretext.cpp
+++ b/src/ports/SkFontHost_mac_coretext.cpp
@@ -18,6 +18,7 @@
 #endif
 
 #include "SkFontHost.h"
+#include "SkCGUtils.h"
 #include "SkDescriptor.h"
 #include "SkEndian.h"
 #include "SkFloatingPoint.h"
@@ -31,6 +32,21 @@
 
 class SkScalerContext_Mac;
 
+// Releases obj unless it is NULL; CFRelease() is only called on non-NULL refs.
+static void CFSafeRelease(CFTypeRef obj) {
+    if (NULL == obj) { return; }
+    CFRelease(obj);
+}
+
+// Scope guard: hands the wrapped CF reference to CFSafeRelease() on destruction.
+class AutoCFRelease : SkNoncopyable {
+public:
+    AutoCFRelease(CFTypeRef obj) : fRef(obj) {}
+    ~AutoCFRelease() { CFSafeRelease(fRef); }
+private:
+    CFTypeRef fRef;     // not retained here; released (if non-NULL) in the dtor
+};
+
 // inline versions of these rect helpers
 
 static bool CGRectIsEmpty_inline(const CGRect& rect) {
@@ -132,7 +148,7 @@
 typedef uint32_t CGRGBPixel;
 
 static unsigned CGRGBPixel_getAlpha(CGRGBPixel pixel) {
-    return pixel >> 24;
+    return pixel & 0xFF;
 }
 
 // The calls to support subpixel are present in 10.5, but are not included in
@@ -247,23 +263,6 @@
     return SkScalarInvert(SkIntToScalar(unitsPerEm));
 }
 
-//============================================================================
-//      Macros
-//----------------------------------------------------------------------------
-// Release a CFTypeRef
-#ifndef CFSafeRelease
-#define CFSafeRelease(_object)                                      \
-    do                                                              \
-        {                                                           \
-        if ((_object) != NULL)                                      \
-            {                                                       \
-            CFRelease((CFTypeRef) (_object));                       \
-            (_object) = NULL;                                       \
-            }                                                       \
-        }                                                           \
-    while (false)
-#endif
-
 ///////////////////////////////////////////////////////////////////////////////
 
 #define BITMAP_INFO_RGB     (kCGImageAlphaNoneSkipFirst | kCGBitmapByteOrder32Host)
@@ -450,7 +449,7 @@
 }
 
 static SkTypeface* GetDefaultFace() {
-    static SkMutex gMutex;
+    SK_DECLARE_STATIC_MUTEX(gMutex);
     SkAutoMutexAcquire ma(gMutex);
 
     static SkTypeface* gDefaultFace;
@@ -464,6 +463,12 @@
 
 ///////////////////////////////////////////////////////////////////////////////
 
+// Exposes the fFontRef stored in a typeface; a NULL face yields NULL.
+extern CTFontRef SkTypeface_GetCTFontRef(const SkTypeface* face);
+CTFontRef SkTypeface_GetCTFontRef(const SkTypeface* face) {
+    if (NULL == face) { return NULL; }
+    return ((const SkTypeface_Mac*)face)->fFontRef;
+}
+
 /*  This function is visible on the outside. It first searches the cache, and if
  *  not found, returns a new entry (after adding it to the cache).
  */
@@ -535,11 +540,9 @@
     }
 
     NameStyleRec rec = { familyName, style };
-    SkTypeface* face = SkTypefaceCache::FindByProc(FindByNameStyle, &rec);
+    SkTypeface* face = SkTypefaceCache::FindByProcAndRef(FindByNameStyle, &rec);
 
-    if (face) {
-        face->ref();
-    } else {
+    if (NULL == face) {
         face = NewFromName(familyName, style);
         if (face) {
             SkTypefaceCache::Add(face, style);
@@ -593,7 +596,12 @@
     SkMatrix                            fVerticalMatrix; // unit rotated
     SkMatrix                            fMatrix; // with font size
     SkMatrix                            fAdjustBadMatrix; // lion-specific fix
+#ifdef SK_USE_COLOR_LUMINANCE
+    Offscreen                           fBlackScreen;
+    Offscreen                           fWhiteScreen;
+#else
     Offscreen                           fOffscreen;
+#endif
     CTFontRef                           fCTFont;
     CTFontRef                           fCTVerticalFont; // for vertical advance
     CGFontRef                           fCGFont;
@@ -1052,7 +1060,7 @@
     for (int i = 0; i < 256; i++) {
         float x = i / 255.f;
         x = powf(x, ee);
-        int xx = SkScalarRound(SkFloatToScalar(x * 255));
+        int xx = SkScalarRoundToInt(SkFloatToScalar(x * 255));
         table[i] = SkToU8(xx);
     }
 }
@@ -1069,6 +1077,29 @@
     return isWhite ? gWhiteTable : gTable;
 }
 
+static const uint8_t* getGammaTable(U8CPU luminance) {  // returns one of 4 lazily built 256-entry tables
+    static uint8_t gGammaTables[4][256];
+    static bool gInited;    // NOTE(review): first-call init is not guarded by any lock in this function
+    if (!gInited) {
+#if 1
+        float start = 1.1;
+        float stop = 2.1;
+        for (int i = 0; i < 4; ++i) {
+            float g = start + (stop - start) * i / 3;   // 4 evenly spaced values, start..stop inclusive
+            build_power_table(gGammaTables[i], 1/g);    // table i is built from the reciprocal of g
+        }
+#else
+        build_power_table(gGammaTables[0], 1);          // disabled alternative: same argument (1) for every table
+        build_power_table(gGammaTables[1], 1);
+        build_power_table(gGammaTables[2], 1);
+        build_power_table(gGammaTables[3], 1);
+#endif
+        gInited = true;
+    }
+    SkASSERT(0 == (luminance >> 8));        // luminance must fit in 8 bits
+    return gGammaTables[luminance >> 6];    // the top two of those bits pick the table (0..3)
+}
+
 static void invertGammaMask(bool isWhite, CGRGBPixel rgb[], int width,
                             int height, size_t rb) {
     const uint8_t* table = getInverseTable(isWhite);
@@ -1097,6 +1128,49 @@
     }
 }
 
+static int lerpScale(int dst, int src, int scale) {
+    return ((scale * (src - dst)) >> 23) + dst;  // scale / 2^23 is the weight given to src
+}
+
+// Blends each 8-bit channel of dst toward src with its own lerpScale() weight.
+// Bits 24-31 of both inputs are not read.
+static CGRGBPixel lerpPixel(CGRGBPixel dst, CGRGBPixel src,
+                            int scaleR, int scaleG, int scaleB) {
+    const int srcR = (src >> 16) & 0xFF, dstR = (dst >> 16) & 0xFF;
+    const int srcG = (src >>  8) & 0xFF, dstG = (dst >>  8) & 0xFF;
+    const int srcB = (src >>  0) & 0xFF, dstB = (dst >>  0) & 0xFF;
+
+    const int outR = lerpScale(dstR, srcR, scaleR);
+    const int outG = lerpScale(dstG, srcG, scaleG);
+    const int outB = lerpScale(dstB, srcB, scaleB);
+    // repack: R shifted to bit 16, G to bit 8, B in the low byte
+    return (outR << 16) | (outG << 8) | outB;
+}
+
+// Blends, in place, every dst pixel toward the bit-inverted src pixel.
+static void lerpPixels(CGRGBPixel dst[], const CGRGBPixel src[], int width,
+                       int height, int rowBytes, int lumBits) {
+#ifdef SK_USE_COLOR_LUMINANCE
+    const int scaleR = (1 << 23) * SkColorGetR(lumBits) / 0xFF;
+    const int scaleG = (1 << 23) * SkColorGetG(lumBits) / 0xFF;
+    const int scaleB = (1 << 23) * SkColorGetB(lumBits) / 0xFF;
+#else
+    // one luminance level, shared by all three channels
+    const int scaleR = (1 << 23) * lumBits / SkScalerContext::kLuminance_Max;
+    const int scaleG = scaleR;
+    const int scaleB = scaleR;
+#endif
+
+    for (int row = 0; row < height; ++row) {
+        for (int col = 0; col < width; ++col) {
+            // src was drawn from black, so we need the complement of its bits
+            dst[col] = lerpPixel(dst[col], ~src[col], scaleR, scaleG, scaleB);
+        }
+        src = (CGRGBPixel*)((char*)src + rowBytes);
+        dst = (CGRGBPixel*)((char*)dst + rowBytes);
+    }
+}
+
 #if 1
 static inline int r32_to_16(int x) { return SkR32ToR16(x); }
 static inline int g32_to_16(int x) { return SkG32ToG16(x); }
@@ -1141,6 +1215,19 @@
 void SkScalerContext_Mac::generateImage(const SkGlyph& glyph) {
     CGGlyph cgGlyph = (CGGlyph) glyph.getGlyphID(fBaseGlyphCount);
 
+    const bool isLCD = isLCDFormat(glyph.fMaskFormat);
+    const bool isBW = SkMask::kBW_Format == glyph.fMaskFormat;
+    const bool isA8 = !isLCD && !isBW;
+    
+#ifdef SK_USE_COLOR_LUMINANCE
+    unsigned lumBits = fRec.getLuminanceColor();
+    uint32_t xorMask = 0;
+
+    if (isA8) {
+        // for A8, we just want a component (they're all the same)
+        lumBits = SkColorGetR(lumBits);
+    }
+#else
     bool fgColorIsWhite = true;
     bool isWhite = fRec.getLuminanceByte() >= WHITE_LUMINANCE_LIMIT;
     bool isBlack = fRec.getLuminanceByte() <= BLACK_LUMINANCE_LIMIT;
@@ -1152,7 +1239,7 @@
      *  extract the r,g,b values, invert-them, and now we have the original
      *  src mask components, which we pack into our 16bit mask.
      */
-    if (isLCDFormat(glyph.fMaskFormat)) {
+    if (isLCD) {
         if (isBlack) {
             xorMask = ~0;
             fgColorIsWhite = false;
@@ -1161,18 +1248,61 @@
             invertGamma = true;
         }
     }
+#endif
 
     size_t cgRowBytes;
+#ifdef SK_USE_COLOR_LUMINANCE
+    CGRGBPixel* cgPixels;
+    const uint8_t* gammaTable = NULL;
+    
+    if (isLCD) {
+        CGRGBPixel* wtPixels = NULL;
+        CGRGBPixel* bkPixels = NULL;
+        bool needBlack = true;
+        bool needWhite = true;
+
+        if (SK_ColorWHITE == lumBits) {
+            needBlack = false;
+        } else if (SK_ColorBLACK == lumBits) {
+            needWhite = false;
+        }
+        
+        if (needBlack) {
+            bkPixels = fBlackScreen.getCG(*this, glyph, false, cgGlyph, &cgRowBytes);
+            cgPixels = bkPixels;
+            xorMask = ~0;
+        }
+        if (needWhite) {
+            wtPixels = fWhiteScreen.getCG(*this, glyph, true, cgGlyph, &cgRowBytes);
+            cgPixels = wtPixels;
+            xorMask = 0;
+        }
+
+        if (wtPixels && bkPixels) {
+            lerpPixels(wtPixels, bkPixels, glyph.fWidth, glyph.fHeight, cgRowBytes,
+                       ~lumBits);
+        }
+    } else {    // isA8 or isBW
+        cgPixels = fWhiteScreen.getCG(*this, glyph, true, cgGlyph, &cgRowBytes);
+        if (isA8) {
+            gammaTable = getGammaTable(lumBits);
+        }
+    }
+#else
     CGRGBPixel* cgPixels = fOffscreen.getCG(*this, glyph, fgColorIsWhite, cgGlyph,
                                             &cgRowBytes);
+#endif
 
     // Draw the glyph
     if (cgPixels != NULL) {
 
+#ifdef SK_USE_COLOR_LUMINANCE
+#else
         if (invertGamma) {
             invertGammaMask(isWhite, (uint32_t*)cgPixels,
                             glyph.fWidth, glyph.fHeight, cgRowBytes);
         }
+#endif
 
         int width = glyph.fWidth;
         switch (glyph.fMaskFormat) {
@@ -1204,7 +1334,11 @@
                 size_t dstRB = glyph.rowBytes();
                 for (int y = 0; y < glyph.fHeight; y++) {
                     for (int i = 0; i < width; ++i) {
-                        dst[i] = CGRGBPixel_getAlpha(cgPixels[i]);
+                        unsigned alpha8 = CGRGBPixel_getAlpha(cgPixels[i]);
+#ifdef SK_USE_COLOR_LUMINANCE
+                        alpha8 = gammaTable[alpha8];
+#endif
+                        dst[i] = alpha8;
                     }
                     cgPixels = (CGRGBPixel*)((char*)cgPixels + cgRowBytes);
                     dst += dstRB;
@@ -1286,6 +1420,11 @@
         // balance the call to CTFontCreateCopyWithAttributes
         CFRelease(font);
     }
+    if (fRec.fFlags & SkScalerContext::kVertical_Flag) {
+        SkIPoint offset;
+        getVerticalOffset(cgGlyph, &offset);
+        path->offset(SkIntToScalar(offset.fX), SkIntToScalar(offset.fY));
+    }
 }
 
 void SkScalerContext_Mac::generateFontMetrics(SkPaint::FontMetrics* mx,
@@ -1349,16 +1488,64 @@
 
 ///////////////////////////////////////////////////////////////////////////////
 
-SkTypeface* SkFontHost::CreateTypefaceFromStream(SkStream* stream)
-{
-//    SkDEBUGFAIL("SkFontHost::CreateTypefaceFromStream unimplemented");
-    return SkFontHost::CreateTypeface(NULL, NULL, NULL, NULL, SkTypeface::kNormal);
+// Returns NULL on failure. The caller must still manage its ownership of
+// provider. NOTE(review): ct is not released here; confirm who owns it.
+static SkTypeface* create_from_dataProvider(CGDataProviderRef provider) {
+    CGFontRef cg = CGFontCreateWithDataProvider(provider);
+    if (NULL == cg) {
+        return NULL;
+    }
+    CTFontRef ct = CTFontCreateWithGraphicsFont(cg, 0, NULL, NULL);
+    CGFontRelease(cg);  // our reference to cg is dropped; only ct is used below
+    return ct ? SkCreateTypefaceFromCTFont(ct) : NULL;  // cg was checked above; ct is the one that can be NULL
 }
 
-SkTypeface* SkFontHost::CreateTypefaceFromFile(const char path[])
-{
-//    SkDEBUGFAIL("SkFontHost::CreateTypefaceFromFile unimplemented");
-    return SkFontHost::CreateTypeface(NULL, NULL, NULL, NULL, SkTypeface::kNormal);
+// Scope guard: calls CGDataProviderRelease() on the wrapped provider when destroyed.
+class AutoCGDataProviderRelease : SkNoncopyable {
+public:
+    AutoCGDataProviderRelease(CGDataProviderRef provider) : fRef(provider) {}
+    ~AutoCGDataProviderRelease() { CGDataProviderRelease(fRef); }
+private:
+    CGDataProviderRef fRef;     // stored as given; this class never retains it
+};
+
+SkTypeface* SkFontHost::CreateTypefaceFromStream(SkStream* stream) {
+    CGDataProviderRef streamData = SkCreateDataProviderFromStream(stream);
+    if (!streamData) {
+        return NULL;    // no provider could be made for this stream
+    }
+    AutoCGDataProviderRelease releaseOnExit(streamData);
+    return create_from_dataProvider(streamData);
+}
+
+SkTypeface* SkFontHost::CreateTypefaceFromFile(const char path[]) {
+    CGDataProviderRef fileData = CGDataProviderCreateWithFilename(path);
+    if (!fileData) {
+        return NULL;    // no provider could be created for this path
+    }
+    AutoCGDataProviderRelease releaseOnExit(fileData);
+    return create_from_dataProvider(fileData);
+}
+
+// Web fonts added to the CTFont registry do not return their character set.
+// Iterate through the font in this case. The existing caller caches the result,
+// so the performance impact isn't too bad. The lowest character wins per glyph.
+static void populate_glyph_to_unicode_slow(CTFontRef ctFont,
+        unsigned glyphCount, SkTDArray<SkUnichar>* glyphToUnicode) {
+    glyphToUnicode->setCount(glyphCount);
+    SkUnichar* out = glyphToUnicode->begin();
+    sk_bzero(out, glyphCount * sizeof(SkUnichar));
+    unsigned unmapped = glyphCount;  // glyphs that still have no character
+    UniChar unichar = 0;
+    while (unmapped > 0) {
+        CGGlyph glyph;
+        if (CTFontGetGlyphsForCharacters(ctFont, &unichar, &glyph, 1) &&
+                glyph < glyphCount && 0 == out[glyph] && 0 != unichar) {
+            out[glyph] = unichar;   // in bounds and not yet mapped
+            --unmapped;             // so each glyph is counted exactly once
+        }
+        if (0 == ++unichar) { break; }  // UniChar wrapped: every value was tried
+    }
 }
 
 // Construct Glyph to Unicode table.
@@ -1368,6 +1555,7 @@
         const unsigned glyphCount, SkTDArray<SkUnichar>* glyphToUnicode) {
     CFCharacterSetRef charSet = CTFontCopyCharacterSet(ctFont);
     if (!charSet) {
+        populate_glyph_to_unicode_slow(ctFont, glyphCount, glyphToUnicode);
         return;
     }
     CFDataRef bitmap = CFCharacterSetCreateBitmapRepresentation(
@@ -1447,10 +1635,25 @@
         populate_glyph_to_unicode(ctFont, glyphCount, &info->fGlyphToUnicode);
     }
 
-    // TODO: get font type, ala:
-    //  CFTypeRef attr = CTFontCopyAttribute(ctFont, kCTFontFormatAttribute);
-    info->fType = SkAdvancedTypefaceMetrics::kTrueType_Font;
     info->fStyle = 0;
+
+    // If it's not a truetype font, mark it as 'other'. Assume that TrueType
+    // fonts always have both glyf and loca tables. At the least, this is what
+    // sfntly needs to subset the font. CTFontCopyAttribute() does not always
+    // succeed in determining this directly. 
+    if (!GetTableSize(fontID, 'glyf') || !GetTableSize(fontID, 'loca')) {
+        info->fType = SkAdvancedTypefaceMetrics::kOther_Font;
+        info->fItalicAngle = 0;
+        info->fAscent = 0;
+        info->fDescent = 0;
+        info->fStemV = 0;
+        info->fCapHeight = 0;
+        info->fBBox = SkIRect::MakeEmpty();
+        CFSafeRelease(ctFont);
+        return info;
+    }
+
+    info->fType = SkAdvancedTypefaceMetrics::kTrueType_Font;
     CTFontSymbolicTraits symbolicTraits = CTFontGetSymbolicTraits(ctFont);
     if (symbolicTraits & kCTFontMonoSpaceTrait) {
         info->fStyle |= SkAdvancedTypefaceMetrics::kFixedPitch_Style;
@@ -1523,10 +1726,6 @@
 
 ///////////////////////////////////////////////////////////////////////////////
 
-bool SkFontHost::ValidFontID(SkFontID fontID) {
-    return SkTypefaceCache::FindByID(fontID) != NULL;
-}
-
 struct FontHeader {
     SkFixed fVersion;
     uint16_t fNumTables;
@@ -1689,9 +1888,16 @@
     }
     rec->setHinting(h);
 
-    // for compatibility at the moment, discretize luminance to 3 settings
-    // black, white, gray. This helps with fontcache utilization, since we
-    // won't create multiple entries that in the end map to the same results.
+#ifdef SK_USE_COLOR_LUMINANCE
+    if (isLCDFormat(rec->fMaskFormat)) {
+        SkColor c = rec->getLuminanceColor();
+        // apply our chosen scaling between Black and White cg output
+        int r = SkColorGetR(c)*2/3;
+        int g = SkColorGetG(c)*2/3;
+        int b = SkColorGetB(c)*2/3;
+        rec->setLuminanceColor(SkColorSetRGB(r, g, b));
+    }
+#else
     {
         unsigned lum = rec->getLuminanceByte();
         if (lum <= BLACK_LUMINANCE_LIMIT) {
@@ -1703,7 +1909,8 @@
         }
         rec->setLuminanceBits(lum);
     }
-    
+#endif
+
     if (SkMask::kLCD16_Format == rec->fMaskFormat
             || SkMask::kLCD32_Format == rec->fMaskFormat) {
         if (supports_LCD()) {
@@ -1717,97 +1924,82 @@
 ///////////////////////////////////////////////////////////////////////////
 
 int SkFontHost::CountTables(SkFontID fontID) {
-    int             numTables;
-    CFArrayRef      cfArray;
-    CTFontRef       ctFont;
-
-
-    // Get the state we need
-    ctFont    = GetFontRefFromFontID(fontID);
-    cfArray   = CTFontCopyAvailableTables(ctFont, kCTFontTableOptionNoOptions);
-    numTables = 0;
-
-
-    // Get the table count
-    if (cfArray != NULL)
-        {
-        numTables = CFArrayGetCount(cfArray);
-        CFSafeRelease(cfArray);
-        }
-
-    return(numTables);
+    CTFontRef ctFont = GetFontRefFromFontID(fontID);
+    CFArrayRef cfArray = CTFontCopyAvailableTables(ctFont,
+                                                   kCTFontTableOptionNoOptions);
+    if (NULL == cfArray) {
+        return 0;
+    }
+    
+    AutoCFRelease ar(cfArray);
+    return CFArrayGetCount(cfArray);
 }
 
-int SkFontHost::GetTableTags(SkFontID fontID, SkFontTableTag tags[])
-{   int             n, numTables;
-    CFArrayRef      cfArray;
-    CTFontRef       ctFont;
+int SkFontHost::GetTableTags(SkFontID fontID, SkFontTableTag tags[]) {
+    CTFontRef ctFont = GetFontRefFromFontID(fontID);
+    CFArrayRef cfArray = CTFontCopyAvailableTables(ctFont,
+                                                   kCTFontTableOptionNoOptions);
+    if (NULL == cfArray) {
+        return 0;
+    }
 
+    AutoCFRelease ar(cfArray);
 
-    // Get the state we need
-    ctFont    = GetFontRefFromFontID(fontID);
-    cfArray   = CTFontCopyAvailableTables(ctFont, kCTFontTableOptionNoOptions);
-    numTables = 0;
-
-
-    // Get the table tags
-    if (cfArray != NULL)
-        {
-        numTables = CFArrayGetCount(cfArray);
-        for (n = 0; n < numTables; n++)
-            tags[n] = (SkFontTableTag) ((uintptr_t) CFArrayGetValueAtIndex(cfArray, n));
-
-        CFSafeRelease(cfArray);
+    int count = CFArrayGetCount(cfArray);
+    if (tags) {  // with NULL tags only the table count is returned
+        for (int i = 0; i < count; ++i) {  // values are read as raw tags, hence the integer cast
+            tags[i] = (SkFontTableTag)(uintptr_t)CFArrayGetValueAtIndex(cfArray, i);
         }
-
-    return(numTables);
+    }
+    return count;
 }
 
-size_t SkFontHost::GetTableSize(SkFontID fontID, SkFontTableTag tag)
-{   size_t      theSize;
-    CTFontRef   ctFont;
-    CFDataRef   cfData;
+// If, as is the case with web fonts, the CTFont data isn't available,
+// the CGFont data may work. While the CGFont may always provide the
+// right result, leave the CTFont code path to minimize disruption.
+static CFDataRef copyTableFromFont(CTFontRef ctFont, SkFontTableTag tag) {
+    CFDataRef data = CTFontCopyTable(ctFont, (CTFontTableTag) tag,
+                                     kCTFontTableOptionNoOptions);
+    if (NULL == data) {
+        CGFontRef cgFont = CTFontCopyGraphicsFont(ctFont, NULL);
+        data = CGFontCopyTableForTag(cgFont, tag);
+        CGFontRelease(cgFont);
+    }
+    return data;
+}
 
+size_t SkFontHost::GetTableSize(SkFontID fontID, SkFontTableTag tag) {
+    CTFontRef ctFont = GetFontRefFromFontID(fontID);
+    CFDataRef srcData = copyTableFromFont(ctFont, tag);
+    if (NULL == srcData) {
+        return 0;
+    }
 
-    // Get the state we need
-    ctFont  = GetFontRefFromFontID(fontID);
-    cfData  = CTFontCopyTable(ctFont, (CTFontTableTag) tag, kCTFontTableOptionNoOptions);
-    theSize = 0;
-
-
-    // Get the data size
-    if (cfData != NULL)
-        {
-        theSize = CFDataGetLength(cfData);
-        CFSafeRelease(cfData);
-        }
-
-    return(theSize);
+    AutoCFRelease ar(srcData);
+    return CFDataGetLength(srcData);
 }
 
 size_t SkFontHost::GetTableData(SkFontID fontID, SkFontTableTag tag,
-                                size_t offset, size_t length, void* data)
-{   size_t          theSize;
-    CTFontRef       ctFont;
-    CFDataRef       cfData;
-
-
-    // Get the state we need
-    ctFont  = GetFontRefFromFontID(fontID);
-    cfData  = CTFontCopyTable(ctFont, (CTFontTableTag) tag, kCTFontTableOptionNoOptions);
-    theSize = 0;
-
-
-    // Get the data
-    if (cfData != NULL)
-        theSize = CFDataGetLength(cfData);
-
-    if (offset >= theSize)
+                                size_t offset, size_t length, void* dst) {
+    CTFontRef ctFont = GetFontRefFromFontID(fontID);
+    CFDataRef srcData = copyTableFromFont(ctFont, tag);
+    if (NULL == srcData) {
         return 0;
+    }
 
-    if ((offset + length) > theSize)
-        length = theSize - offset;
+    AutoCFRelease ar(srcData);
 
-    memcpy(data, CFDataGetBytePtr(cfData) + offset, length);
-    return(length);
+    size_t srcSize = CFDataGetLength(srcData);
+    if (offset >= srcSize) {
+        return 0;
+    }
+
+    if (length > srcSize - offset) {  // offset < srcSize here; offset + length could wrap
+        length = srcSize - offset;
+    }
+
+    if (dst) {  // with NULL dst only the clamped length is returned
+        memcpy(dst, CFDataGetBytePtr(srcData) + offset, length);
+    }
+    return length;
 }
diff --git a/src/ports/SkFontHost_none.cpp b/src/ports/SkFontHost_none.cpp
index 99df213..e79926d 100644
--- a/src/ports/SkFontHost_none.cpp
+++ b/src/ports/SkFontHost_none.cpp
@@ -40,11 +40,6 @@
 
 ///////////////////////////////////////////////////////////////////////////////
 
-bool SkFontHost::ValidFontID(uint32_t uniqueID) {
-    SkDEBUGFAIL("SkFontHost::ResolveTypeface unimplemented");
-    return false;
-}
-
 SkStream* SkFontHost::OpenStream(uint32_t uniqueID) {
     SkDEBUGFAIL("SkFontHost::OpenStream unimplemented");
     return NULL;
diff --git a/src/ports/SkFontHost_simple.cpp b/src/ports/SkFontHost_simple.cpp
index 0624d35..eef4dd7 100644
--- a/src/ports/SkFontHost_simple.cpp
+++ b/src/ports/SkFontHost_simple.cpp
@@ -23,8 +23,8 @@
     #define SK_FONT_FILE_PREFIX          "/skimages/"
 #endif
 
-SkTypeface::Style find_name_and_attributes(SkStream* stream, SkString* name,
-                                           bool* isFixedWidth);
+bool find_name_and_attributes(SkStream* stream, SkString* name,
+                              SkTypeface::Style* style, bool* isFixedWidth);
 
 static void GetFullPathForSysFonts(SkString* full, const char name[]) {
     full->set(SK_FONT_FILE_PREFIX);
@@ -59,7 +59,7 @@
 static int32_t gUniqueFontID;
 
 // this is the mutex that protects these globals
-static SkMutex gFamilyMutex;
+SK_DECLARE_STATIC_MUTEX(gFamilyMutex);
 static FamilyRec* gFamilyHead;
 static SkTDArray<NameFamilyPair> gNameList;
 
@@ -350,20 +350,17 @@
 
 static bool get_name_and_style(const char path[], SkString* name,
                                SkTypeface::Style* style, bool isExpected) {
-    bool            isFixedWidth;
     SkString        fullpath;
     GetFullPathForSysFonts(&fullpath, path);
 
     SkMMAPStream stream(fullpath.c_str());
     if (stream.getLength() > 0) {
-        *style = find_name_and_attributes(&stream, name, &isFixedWidth);
-        return true;
+        return find_name_and_attributes(&stream, name, style, NULL);
     }
     else {
         SkFILEStream stream(fullpath.c_str());
         if (stream.getLength() > 0) {
-            *style = find_name_and_attributes(&stream, name, &isFixedWidth);
-            return true;
+            return find_name_and_attributes(&stream, name, style, NULL);
         }
     }
 
@@ -562,12 +559,6 @@
     return tf;
 }
 
-bool SkFontHost::ValidFontID(uint32_t fontID) {
-    SkAutoMutexAcquire  ac(gFamilyMutex);
-
-    return find_from_uniqueID(fontID) != NULL;
-}
-
 SkStream* SkFontHost::OpenStream(uint32_t fontID) {
     SkAutoMutexAcquire  ac(gFamilyMutex);
 
@@ -635,12 +626,12 @@
         return NULL;
     }
 
-    bool     isFixedWidth;
-    SkString name;
-    SkTypeface::Style style = find_name_and_attributes(stream, &name,
-                                                       &isFixedWidth);
-
-    return SkNEW_ARGS(StreamTypeface, (style, false, NULL, stream));
+    SkTypeface::Style style;
+    if (find_name_and_attributes(stream, NULL, &style, NULL)) {
+        return SkNEW_ARGS(StreamTypeface, (style, false, NULL, stream));
+    } else {
+        return NULL;
+    }
 }
 
 SkTypeface* SkFontHost::CreateTypefaceFromFile(const char path[]) {
diff --git a/src/ports/SkFontHost_win.cpp b/src/ports/SkFontHost_win.cpp
index 9d98bcb..ec7e1b0 100755
--- a/src/ports/SkFontHost_win.cpp
+++ b/src/ports/SkFontHost_win.cpp
@@ -175,10 +175,8 @@
 SkTypeface* SkCreateTypefaceFromLOGFONT(const LOGFONT& origLF) {
     LOGFONT lf = origLF;
     make_canonical(&lf);
-    SkTypeface* face = SkTypefaceCache::FindByProc(FindByLogFont, &lf);
-    if (face) {
-        face->ref();
-    } else {
+    SkTypeface* face = SkTypefaceCache::FindByProcAndRef(FindByLogFont, &lf);
+    if (NULL == face) {
         face = LogFontTypeface::Create(lf);
         SkTypefaceCache::Add(face, get_style(lf));
     }
@@ -205,7 +203,9 @@
 
 static void ensure_typeface_accessible(SkFontID fontID) {
     LogFontTypeface* face = (LogFontTypeface*)SkTypefaceCache::FindByID(fontID);
-    SkFontHost::EnsureTypefaceAccessible(*face);
+    if (face) {
+        SkFontHost::EnsureTypefaceAccessible(*face);
+    }
 }
 
 static void GetLogFontByID(SkFontID fontID, LOGFONT* lf) {
@@ -448,7 +448,7 @@
     return SkFixedToFIXED(SkFloatToFixed(x));
 }
 
-static SkMutex gFTMutex;
+SK_DECLARE_STATIC_MUTEX(gFTMutex);
 
 #define HIRES_TEXTSIZE  2048
 #define HIRES_SHIFT     11
@@ -468,7 +468,11 @@
         case SkMask::kLCD32_Format:
             return CLEARTYPE_QUALITY;
         default:
-            return ANTIALIASED_QUALITY;
+            if (rec.fFlags & SkScalerContext::kGenA8FromLCD_Flag) {
+                return CLEARTYPE_QUALITY;
+            } else {
+                return ANTIALIASED_QUALITY;
+            }
     }
 }
 
@@ -713,12 +717,16 @@
 // gdi's bitmap is upside-down, so we reverse dst walking in Y
 // whenever we copy it into skia's buffer
 
+static int compute_luminance(int r, int g, int b) {
+    // weighted average; 27 + 92 + 9 == 128, hence >> 7 (was (r*2 + g*5 + b) >> 3)
+    return (r * 27 + g * 92 + b * 9) >> 7;
+}
+
 static inline uint8_t rgb_to_a8(SkGdiRGB rgb) {
     int r = (rgb >> 16) & 0xFF;
     int g = (rgb >>  8) & 0xFF;
     int b = (rgb >>  0) & 0xFF;
-
-    return (r * 2 + g * 5 + b) >> 3;  // luminance
+    return compute_luminance(r, g, b);
 }
 
 static inline uint16_t rgb_to_lcd16(SkGdiRGB rgb) {
diff --git a/src/ports/SkImageDecoder_CG.cpp b/src/ports/SkImageDecoder_CG.cpp
index 81c6b37..4e2bcc9 100644
--- a/src/ports/SkImageDecoder_CG.cpp
+++ b/src/ports/SkImageDecoder_CG.cpp
@@ -171,10 +171,9 @@
         return false;
     }
     SkAutoTCallVProc<CGImage, CGImageRelease> agimage(image);
-    
-	CGImageDestinationAddImage(dst, image, NULL);
-	CGImageDestinationFinalize(dst);
-    return true;
+
+    CGImageDestinationAddImage(dst, image, NULL);
+    return CGImageDestinationFinalize(dst);
 }
 
 SkImageEncoder* SkImageEncoder::Create(Type t) {
diff --git a/src/ports/SkThread_none.cpp b/src/ports/SkThread_none.cpp
index e70acde..8361021 100644
--- a/src/ports/SkThread_none.cpp
+++ b/src/ports/SkThread_none.cpp
@@ -9,33 +9,23 @@
 
 #include "SkThread.h"
 
-int32_t sk_atomic_inc(int32_t* addr)
-{
+int32_t sk_atomic_inc(int32_t* addr) {
     int32_t value = *addr;
     *addr = value + 1;
     return value;
 }
 
-int32_t sk_atomic_dec(int32_t* addr)
-{
+int32_t sk_atomic_dec(int32_t* addr) {
     int32_t value = *addr;
     *addr = value - 1;
     return value;
 }
 
-SkMutex::SkMutex(bool /* isGlobal */)
-{
-}
+SkMutex::SkMutex() {}
 
-SkMutex::~SkMutex()
-{
-}
+SkMutex::~SkMutex() {}
 
-void SkMutex::acquire()
-{
-}
+void SkMutex::acquire() {}
 
-void SkMutex::release()
-{
-}
+void SkMutex::release() {}
 
diff --git a/src/ports/SkThread_pthread.cpp b/src/ports/SkThread_pthread.cpp
index 51c0859..4750d4f 100644
--- a/src/ports/SkThread_pthread.cpp
+++ b/src/ports/SkThread_pthread.cpp
@@ -55,7 +55,7 @@
 int32_t sk_atomic_dec(int32_t* addr)
 {
     SkAutoMutexAcquire ac(gAtomicMutex);
-    
+
     int32_t value = *addr;
     *addr = value - 1;
     return value;
@@ -67,27 +67,48 @@
 
 //////////////////////////////////////////////////////////////////////////////
 
-static void print_pthread_error(int status)
-{
+static void print_pthread_error(int status) {
     switch (status) {
     case 0: // success
         break;
     case EINVAL:
-        printf("pthread error [%d] EINVAL\n", status);
+        SkDebugf("pthread error [%d] EINVAL\n", status);
         break;
     case EBUSY:
-        printf("pthread error [%d] EBUSY\n", status);
+        SkDebugf("pthread error [%d] EBUSY\n", status);
         break;
     default:
-        printf("pthread error [%d] unknown\n", status);
+        SkDebugf("pthread error [%d] unknown\n", status);
         break;
     }
 }
 
-SkMutex::SkMutex(bool isGlobal) : fIsGlobal(isGlobal)
-{
-    if (sizeof(pthread_mutex_t) > sizeof(fStorage))
-    {
+#ifdef SK_USE_POSIX_THREADS
+
+SkMutex::SkMutex() {
+    int status;
+
+    status = pthread_mutex_init(&fMutex, NULL);
+    if (status != 0) {
+        print_pthread_error(status);
+        SkASSERT(0 == status);
+    }
+}
+
+SkMutex::~SkMutex() {
+    int status = pthread_mutex_destroy(&fMutex);
+
+    // report (and assert on) any failure to destroy the mutex
+    if (status != 0) {
+        print_pthread_error(status);
+        SkASSERT(0 == status);
+    }
+}
+
+#else // !SK_USE_POSIX_THREADS
+
+SkMutex::SkMutex() {
+    if (sizeof(pthread_mutex_t) > sizeof(fStorage)) {
         SkDEBUGF(("pthread mutex size = %d\n", sizeof(pthread_mutex_t)));
         SkDEBUGFAIL("mutex storage is too small");
     }
@@ -104,29 +125,27 @@
     SkASSERT(0 == status);
 }
 
-SkMutex::~SkMutex()
-{
+SkMutex::~SkMutex() {
     int status = pthread_mutex_destroy((pthread_mutex_t*)fStorage);
-    
+#if 0
     // only report errors on non-global mutexes
-    if (!fIsGlobal)
-    {
+    if (!fIsGlobal) {
         print_pthread_error(status);
         SkASSERT(0 == status);
     }
+#endif
 }
 
-void SkMutex::acquire()
-{
+void SkMutex::acquire() {
     int status = pthread_mutex_lock((pthread_mutex_t*)fStorage);
     print_pthread_error(status);
     SkASSERT(0 == status);
 }
 
-void SkMutex::release()
-{
+void SkMutex::release() {
     int status = pthread_mutex_unlock((pthread_mutex_t*)fStorage);
     print_pthread_error(status);
     SkASSERT(0 == status);
 }
 
+#endif // !SK_USE_POSIX_THREADS
diff --git a/src/ports/SkThread_win.cpp b/src/ports/SkThread_win.cpp
index 5fa58dd..70b8e11 100644
--- a/src/ports/SkThread_win.cpp
+++ b/src/ports/SkThread_win.cpp
@@ -8,38 +8,39 @@
 
 
 #include <windows.h>
+#include <intrin.h>
 #include "SkThread.h"
 
-int32_t sk_atomic_inc(int32_t* addr)
-{
+//MSDN says in order to declare an interlocked function for use as an
+//intrinsic, include intrin.h and put the function in a #pragma intrinsic
+//directive.
+//The pragma appears to be unnecessary, but doesn't hurt.
+#pragma intrinsic(_InterlockedIncrement, _InterlockedDecrement)
+
+int32_t sk_atomic_inc(int32_t* addr) {
     // InterlockedIncrement returns the new value, we want to return the old.
-    return InterlockedIncrement(reinterpret_cast<LONG*>(addr)) - 1;
+    return _InterlockedIncrement(reinterpret_cast<LONG*>(addr)) - 1;
 }
 
-int32_t sk_atomic_dec(int32_t* addr)
-{
-    return InterlockedDecrement(reinterpret_cast<LONG*>(addr)) + 1;
+int32_t sk_atomic_dec(int32_t* addr) {
+    return _InterlockedDecrement(reinterpret_cast<LONG*>(addr)) + 1;
 }
 
-SkMutex::SkMutex(bool /* isGlobal */)
-{
+SkMutex::SkMutex() {
     SK_COMPILE_ASSERT(sizeof(fStorage) > sizeof(CRITICAL_SECTION),
                       NotEnoughSizeForCriticalSection);
     InitializeCriticalSection(reinterpret_cast<CRITICAL_SECTION*>(&fStorage));
 }
 
-SkMutex::~SkMutex()
-{
+SkMutex::~SkMutex() {
     DeleteCriticalSection(reinterpret_cast<CRITICAL_SECTION*>(&fStorage));
 }
 
-void SkMutex::acquire()
-{
+void SkMutex::acquire() {
     EnterCriticalSection(reinterpret_cast<CRITICAL_SECTION*>(&fStorage));
 }
 
-void SkMutex::release()
-{
+void SkMutex::release() {
     LeaveCriticalSection(reinterpret_cast<CRITICAL_SECTION*>(&fStorage));
 }
 
diff --git a/src/svg/SkSVGCircle.cpp b/src/svg/SkSVGCircle.cpp
index 2f282bd..e34e179 100644
--- a/src/svg/SkSVGCircle.cpp
+++ b/src/svg/SkSVGCircle.cpp
@@ -33,13 +33,13 @@
     right = cx + r;
     bottom = cy + r;
     char scratch[16];
-    sprintf(scratch, "%g", left);
+    sprintf(scratch, "%g", SkScalarToDouble(left));
     parser._addAttribute("left", scratch);
-    sprintf(scratch, "%g", top);
+    sprintf(scratch, "%g", SkScalarToDouble(top));
     parser._addAttribute("top", scratch);
-    sprintf(scratch, "%g", right);
+    sprintf(scratch, "%g", SkScalarToDouble(right));
     parser._addAttribute("right", scratch);
-    sprintf(scratch, "%g", bottom);
+    sprintf(scratch, "%g", SkScalarToDouble(bottom));
     parser._addAttribute("bottom", scratch);
     parser._endElement();
 }
diff --git a/src/svg/SkSVGEllipse.cpp b/src/svg/SkSVGEllipse.cpp
index e239565..281e4e9 100644
--- a/src/svg/SkSVGEllipse.cpp
+++ b/src/svg/SkSVGEllipse.cpp
@@ -35,13 +35,13 @@
     right = cx + rx;
     bottom = cy + ry;
     char scratch[16];
-    sprintf(scratch, "%g", left);
+    sprintf(scratch, "%g", SkScalarToDouble(left));
     parser._addAttribute("left", scratch);
-    sprintf(scratch, "%g", top);
+    sprintf(scratch, "%g", SkScalarToDouble(top));
     parser._addAttribute("top", scratch);
-    sprintf(scratch, "%g", right);
+    sprintf(scratch, "%g", SkScalarToDouble(right));
     parser._addAttribute("right", scratch);
-    sprintf(scratch, "%g", bottom);
+    sprintf(scratch, "%g", SkScalarToDouble(bottom));
     parser._addAttribute("bottom", scratch);
     parser._endElement();
 }
diff --git a/src/animator/SkBase64.cpp b/src/utils/SkBase64.cpp
similarity index 95%
rename from src/animator/SkBase64.cpp
rename to src/utils/SkBase64.cpp
index 076f649..a8d4e87 100644
--- a/src/animator/SkBase64.cpp
+++ b/src/utils/SkBase64.cpp
@@ -12,7 +12,7 @@
 #define DecodePad -2
 #define EncodePad 64
 
-static const char encode[] = 
+static const char default_encode[] = 
     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
     "abcdefghijklmnopqrstuvwxyz"
     "0123456789+/=";
@@ -109,7 +109,13 @@
 #pragma warning ( pop )
 #endif
 
-size_t SkBase64::Encode(const void* srcPtr, size_t length, void* dstPtr) {
+size_t SkBase64::Encode(const void* srcPtr, size_t length, void* dstPtr, const char* encodeMap) {
+    const char* encode;
+    if (NULL == encodeMap) {
+        encode = default_encode;
+    } else {
+        encode = encodeMap;
+    }
     const unsigned char* src = (const unsigned char*) srcPtr;
     unsigned char* dst = (unsigned char*) dstPtr;
     if (dst) {
diff --git a/src/animator/SkBase64.h b/src/utils/SkBase64.h
similarity index 75%
rename from src/animator/SkBase64.h
rename to src/utils/SkBase64.h
index 69d256c..4f3b323 100644
--- a/src/animator/SkBase64.h
+++ b/src/utils/SkBase64.h
@@ -23,7 +23,12 @@
     SkBase64();
     Error decode(const char* src, size_t length);
     char* getData() { return fData; }
-    static size_t Encode(const void* src, size_t length, void* dest);
+    /**
+       Base64 encodes src into dest. encode, if not NULL, points to at least
+       65 chars; encode[64] is used as the pad character. Encodings other than
+       the default encoding cannot be decoded.
+    */
+    static size_t Encode(const void* src, size_t length, void* dest, const char* encode = NULL);
 
 #ifdef SK_SUPPORT_UNITTEST
     static void UnitTest();
diff --git a/src/utils/SkDeferredCanvas.cpp b/src/utils/SkDeferredCanvas.cpp
new file mode 100644
index 0000000..e965050
--- /dev/null
+++ b/src/utils/SkDeferredCanvas.cpp
@@ -0,0 +1,595 @@
+
+/*
+ * Copyright 2011 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkDeferredCanvas.h"
+
+#include "SkPaint.h"
+#include "SkShader.h"
+#include "SkColorFilter.h"
+#include "SkDrawFilter.h"
+
+namespace {
+
+bool isPaintOpaque(const SkPaint* paint, 
+                   const SkBitmap* bmpReplacesShader = NULL) {
+    // TODO: SkXfermode should have a virtual isOpaque method, which would
+    // make it possible to test modes that do not have a Coeff representation.
+
+    if (!paint) {
+        return bmpReplacesShader ? bmpReplacesShader->isOpaque() : true;
+    }
+
+    SkXfermode::Coeff srcCoeff, dstCoeff;
+    if (SkXfermode::AsCoeff(paint->getXfermode(), &srcCoeff, &dstCoeff)){
+        switch (dstCoeff) {
+        case SkXfermode::kZero_Coeff:
+            return true;
+        case SkXfermode::kISA_Coeff:
+            if (paint->getAlpha() != 255) {
+                break;
+            }
+            if (bmpReplacesShader) {
+                if (!bmpReplacesShader->isOpaque()) {
+                    break;
+                }
+            } else if (paint->getShader() && !paint->getShader()->isOpaque()) {
+                break;
+            }
+            if (paint->getColorFilter() && 
+                ((paint->getColorFilter()->getFlags() &
+                SkColorFilter::kAlphaUnchanged_Flag) == 0)) {
+                break;
+            }
+            return true;
+        case SkXfermode::kSA_Coeff:
+            if (paint->getAlpha() != 0) {
+                break;
+            }
+            if (paint->getColorFilter() && 
+                ((paint->getColorFilter()->getFlags() &
+                SkColorFilter::kAlphaUnchanged_Flag) == 0)) {
+                break;
+            }
+            return true;
+        case SkXfermode::kSC_Coeff:
+            if (paint->getColor() != 0) { // all components must be 0
+                break;
+            }
+            if (bmpReplacesShader || paint->getShader()) {
+                break;
+            }
+            if (paint->getColorFilter() && (
+                (paint->getColorFilter()->getFlags() &
+                SkColorFilter::kAlphaUnchanged_Flag) == 0)) {
+                break;
+            }
+            return true;
+        default:
+            break;
+        }
+    }
+    return false;
+}
+
+} // unnamed namespace
+
+SkDeferredCanvas::SkDeferredCanvas() {
+    init();
+}
+
+SkDeferredCanvas::SkDeferredCanvas(SkDevice* device) {
+    init();
+    setDevice(device);
+}
+
+SkDeferredCanvas::SkDeferredCanvas(SkDevice* device, 
+                                   DeviceContext* deviceContext) {
+    init();
+    setDevice(device);
+    setDeviceContext(deviceContext);
+}
+
+void SkDeferredCanvas::init() {
+    fDeferredDrawing = true; // On by default
+}
+
+void SkDeferredCanvas::validate() const {
+    SkASSERT(getDevice());
+}
+
+SkCanvas* SkDeferredCanvas::drawingCanvas() const {
+    validate();
+    return fDeferredDrawing ? getDeferredDevice()->recordingCanvas() :
+        getDeferredDevice()->immediateCanvas();
+}
+
+void SkDeferredCanvas::flushIfNeeded(const SkBitmap& bitmap) {
+    validate();
+    if (fDeferredDrawing) {
+        getDeferredDevice()->flushIfNeeded(bitmap);
+    }
+}
+
+SkDeferredCanvas::DeferredDevice* SkDeferredCanvas::getDeferredDevice() const {
+    return static_cast<SkDeferredCanvas::DeferredDevice*>(getDevice());
+}
+
+void SkDeferredCanvas::setDeferredDrawing(bool val) {
+    validate(); // Must set device before calling this method
+    SkASSERT(drawingCanvas()->getSaveCount() == 1);
+    if (val != fDeferredDrawing) {
+        if (fDeferredDrawing) {
+            // Going live.
+            getDeferredDevice()->flushPending();
+        }
+        fDeferredDrawing = val;
+    }
+}
+
+SkDeferredCanvas::~SkDeferredCanvas() {
+}
+
+SkDevice* SkDeferredCanvas::setDevice(SkDevice* device) {
+    INHERITED::setDevice(SkNEW_ARGS(DeferredDevice, (device)))->unref();
+    return device;
+}
+
+SkDeferredCanvas::DeviceContext* SkDeferredCanvas::setDeviceContext(
+    DeviceContext* deviceContext) {
+
+    DeferredDevice* deferredDevice = getDeferredDevice();
+    SkASSERT(deferredDevice);
+    if (deferredDevice) {
+        deferredDevice->setDeviceContext(deviceContext);
+    }
+    return deviceContext;
+}
+
+bool SkDeferredCanvas::isFullFrame(const SkRect* rect,
+                                   const SkPaint* paint) const {
+    SkCanvas* canvas = drawingCanvas();
+    SkISize canvasSize = getDeviceSize();
+    if (rect) {
+        if (!canvas->getTotalMatrix().rectStaysRect()) {
+            return false; // conservative
+        }
+
+        SkRect transformedRect;
+        canvas->getTotalMatrix().mapRect(&transformedRect, *rect);
+
+        if (paint) {
+            SkPaint::Style paintStyle = paint->getStyle();
+            if (!(paintStyle == SkPaint::kFill_Style || 
+                paintStyle == SkPaint::kStrokeAndFill_Style)) {
+                return false;
+            }
+            if (paint->getMaskFilter() || paint->getLooper()
+                || paint->getPathEffect() || paint->getImageFilter()) {
+                return false; // conservative
+            }
+        }
+
+        // The following test holds with AA enabled, and is conservative
+        // by a 0.5 pixel margin with AA disabled
+        if (transformedRect.fLeft > SkIntToScalar(0) || 
+            transformedRect.fTop > SkIntToScalar(0) || 
+            transformedRect.fRight < SkIntToScalar(canvasSize.fWidth) ||
+            transformedRect.fBottom < SkIntToScalar(canvasSize.fHeight)) {
+            return false;
+        }
+    }
+
+    switch (canvas->getClipType()) {
+        case SkCanvas::kRect_ClipType :
+            {
+                SkIRect bounds;
+                canvas->getClipDeviceBounds(&bounds);
+                if (bounds.fLeft > 0 || bounds.fTop > 0 || 
+                    bounds.fRight < canvasSize.fWidth || 
+                    bounds.fBottom < canvasSize.fHeight)
+                    return false;
+            }
+            break;
+        case SkCanvas::kComplex_ClipType :
+            return false; // conservative
+        case SkCanvas::kEmpty_ClipType:
+        default:
+            break;
+    };
+
+    return true;
+}
+
+int SkDeferredCanvas::save(SaveFlags flags) {
+    drawingCanvas()->save(flags);
+    return this->INHERITED::save(flags);
+}
+
+int SkDeferredCanvas::saveLayer(const SkRect* bounds, const SkPaint* paint,
+                                SaveFlags flags) {
+    drawingCanvas()->saveLayer(bounds, paint, flags);
+    int count = this->INHERITED::save(flags);
+    this->clipRectBounds(bounds, flags, NULL);
+    return count;
+}
+
+void SkDeferredCanvas::restore() {
+    drawingCanvas()->restore();
+    this->INHERITED::restore();
+}
+
+bool SkDeferredCanvas::isDrawingToLayer() const {
+    return drawingCanvas()->isDrawingToLayer();
+}
+
+bool SkDeferredCanvas::translate(SkScalar dx, SkScalar dy) {
+    drawingCanvas()->translate(dx, dy);
+    return this->INHERITED::translate(dx, dy);
+}
+
+bool SkDeferredCanvas::scale(SkScalar sx, SkScalar sy) {
+    drawingCanvas()->scale(sx, sy);
+    return this->INHERITED::scale(sx, sy);
+}
+
+bool SkDeferredCanvas::rotate(SkScalar degrees) {
+    drawingCanvas()->rotate(degrees);
+    return this->INHERITED::rotate(degrees);
+}
+
+bool SkDeferredCanvas::skew(SkScalar sx, SkScalar sy) {
+    drawingCanvas()->skew(sx, sy);
+    return this->INHERITED::skew(sx, sy);
+}
+
+bool SkDeferredCanvas::concat(const SkMatrix& matrix) {
+    drawingCanvas()->concat(matrix);
+    return this->INHERITED::concat(matrix);
+}
+
+void SkDeferredCanvas::setMatrix(const SkMatrix& matrix) {
+    drawingCanvas()->setMatrix(matrix);
+    this->INHERITED::setMatrix(matrix);
+}
+
+bool SkDeferredCanvas::clipRect(const SkRect& rect,
+                                SkRegion::Op op,
+                                bool doAntiAlias) {
+    drawingCanvas()->clipRect(rect, op, doAntiAlias);
+    return this->INHERITED::clipRect(rect, op, doAntiAlias);
+}
+
+bool SkDeferredCanvas::clipPath(const SkPath& path,
+                                SkRegion::Op op,
+                                bool doAntiAlias) {
+    drawingCanvas()->clipPath(path, op, doAntiAlias);
+    return this->INHERITED::clipPath(path, op, doAntiAlias);
+}
+
+bool SkDeferredCanvas::clipRegion(const SkRegion& deviceRgn,
+                                  SkRegion::Op op) {
+    drawingCanvas()->clipRegion(deviceRgn, op);
+    return this->INHERITED::clipRegion(deviceRgn, op);
+}
+
+void SkDeferredCanvas::clear(SkColor color) {
+    // purge pending commands
+    if (fDeferredDrawing) {
+        getDeferredDevice()->contentsCleared();
+    }
+
+    drawingCanvas()->clear(color);
+}
+
+void SkDeferredCanvas::drawPaint(const SkPaint& paint) {
+    if (fDeferredDrawing && isFullFrame(NULL, &paint) && 
+        isPaintOpaque(&paint)) {
+        getDeferredDevice()->contentsCleared();
+    }
+
+    drawingCanvas()->drawPaint(paint);
+}
+
+void SkDeferredCanvas::drawPoints(PointMode mode, size_t count,
+                                  const SkPoint pts[], const SkPaint& paint) {
+    drawingCanvas()->drawPoints(mode, count, pts, paint);
+}
+
+void SkDeferredCanvas::drawRect(const SkRect& rect, const SkPaint& paint) {
+    if (fDeferredDrawing && isFullFrame(&rect, &paint) && 
+        isPaintOpaque(&paint)) {
+        getDeferredDevice()->contentsCleared();
+    }
+
+    drawingCanvas()->drawRect(rect, paint);
+}
+
+void SkDeferredCanvas::drawPath(const SkPath& path, const SkPaint& paint) {
+    drawingCanvas()->drawPath(path, paint);
+}
+
+void SkDeferredCanvas::drawBitmap(const SkBitmap& bitmap, SkScalar left,
+                                  SkScalar top, const SkPaint* paint) {
+    SkRect bitmapRect = SkRect::MakeXYWH(left, top,
+        SkIntToScalar(bitmap.width()), SkIntToScalar(bitmap.height()));
+    if (fDeferredDrawing && 
+        isFullFrame(&bitmapRect, paint) &&
+        isPaintOpaque(paint, &bitmap)) {
+        getDeferredDevice()->contentsCleared();
+    }
+
+    drawingCanvas()->drawBitmap(bitmap, left, top, paint);
+    flushIfNeeded(bitmap);
+}
+
+void SkDeferredCanvas::drawBitmapRect(const SkBitmap& bitmap, 
+                                      const SkIRect* src,
+                                      const SkRect& dst,
+                                      const SkPaint* paint) {
+    if (fDeferredDrawing && 
+        isFullFrame(&dst, paint) &&
+        isPaintOpaque(paint, &bitmap)) {
+        getDeferredDevice()->contentsCleared();
+    }
+
+    drawingCanvas()->drawBitmapRect(bitmap, src,
+                                    dst, paint);
+    flushIfNeeded(bitmap);
+}
+
+
+void SkDeferredCanvas::drawBitmapMatrix(const SkBitmap& bitmap,
+                                        const SkMatrix& m,
+                                        const SkPaint* paint) {
+    // TODO: reset recording canvas if paint+bitmap is opaque and clip rect
+    // covers canvas entirely and transformed bitmap covers canvas entirely
+    drawingCanvas()->drawBitmapMatrix(bitmap, m, paint);
+    flushIfNeeded(bitmap);
+}
+
+void SkDeferredCanvas::drawBitmapNine(const SkBitmap& bitmap,
+                                      const SkIRect& center, const SkRect& dst,
+                                      const SkPaint* paint) {
+    // TODO: reset recording canvas if paint+bitmap is opaque and clip rect
+    // covers canvas entirely and dst covers canvas entirely
+    drawingCanvas()->drawBitmapNine(bitmap, center,
+                                    dst, paint);
+    flushIfNeeded(bitmap);
+}
+
+void SkDeferredCanvas::drawSprite(const SkBitmap& bitmap, int left, int top,
+                                  const SkPaint* paint) {
+    // Rectangle occupied by the sprite, converted from ints to scalars.
+    const SkScalar spriteW = SkIntToScalar(bitmap.width());
+    const SkScalar spriteH = SkIntToScalar(bitmap.height());
+    const SkRect spriteRect = SkRect::MakeXYWH(
+        SkIntToScalar(left), SkIntToScalar(top), spriteW, spriteH);
+    // While deferring, an opaque full-frame sprite lets the device purge
+    // the draws recorded before it.
+    if (fDeferredDrawing && isFullFrame(&spriteRect, paint) &&
+        isPaintOpaque(paint, &bitmap)) {
+        getDeferredDevice()->contentsCleared();
+    }
+
+    drawingCanvas()->drawSprite(bitmap, left, top, paint);
+    flushIfNeeded(bitmap);
+}
+
+void SkDeferredCanvas::drawText(const void* text, size_t byteLength,
+                                SkScalar x, SkScalar y, const SkPaint& paint) {
+    drawingCanvas()->drawText(text, byteLength, x, y, paint);
+}
+
+void SkDeferredCanvas::drawPosText(const void* text, size_t byteLength,
+                                   const SkPoint pos[], const SkPaint& paint) {
+    drawingCanvas()->drawPosText(text, byteLength, pos, paint);
+}
+
+void SkDeferredCanvas::drawPosTextH(const void* text, size_t byteLength,
+                                    const SkScalar xpos[], SkScalar constY,
+                                    const SkPaint& paint) {
+    drawingCanvas()->drawPosTextH(text, byteLength, xpos, constY, paint);
+}
+
+void SkDeferredCanvas::drawTextOnPath(const void* text, size_t byteLength,
+                                      const SkPath& path,
+                                      const SkMatrix* matrix,
+                                      const SkPaint& paint) {
+    drawingCanvas()->drawTextOnPath(text, byteLength,
+                                    path, matrix,
+                                    paint);
+}
+
+void SkDeferredCanvas::drawPicture(SkPicture& picture) {
+    drawingCanvas()->drawPicture(picture);
+}
+
+void SkDeferredCanvas::drawVertices(VertexMode vmode, int vertexCount,
+                                    const SkPoint vertices[],
+                                    const SkPoint texs[],
+                                    const SkColor colors[], SkXfermode* xmode,
+                                    const uint16_t indices[], int indexCount,
+                                    const SkPaint& paint) {
+    drawingCanvas()->drawVertices(vmode, vertexCount,
+                                  vertices, texs,
+                                  colors, xmode,
+                                  indices, indexCount,
+                                  paint);
+}
+
+SkBounder* SkDeferredCanvas::setBounder(SkBounder* bounder) {
+    drawingCanvas()->setBounder(bounder);
+    return INHERITED::setBounder(bounder);
+}
+
+SkDrawFilter* SkDeferredCanvas::setDrawFilter(SkDrawFilter* filter) {
+    drawingCanvas()->setDrawFilter(filter); 
+    return INHERITED::setDrawFilter(filter);
+}
+
+SkCanvas* SkDeferredCanvas::canvasForDrawIter() {
+    return drawingCanvas();
+}
+
+// SkDeferredCanvas::DeferredDevice
+//------------------------------------
+
+SkDeferredCanvas::DeferredDevice::DeferredDevice(
+    SkDevice* immediateDevice, DeviceContext* deviceContext) :
+    SkDevice(SkBitmap::kNo_Config, immediateDevice->width(),
+             immediateDevice->height(), immediateDevice->isOpaque())
+    , fFreshFrame(true) {
+    // Mirrors immediateDevice's size and opacity; deviceContext may be NULL.
+    fDeviceContext = deviceContext;
+    SkSafeRef(fDeviceContext);
+    fImmediateDevice = immediateDevice; // ref counted via fImmediateCanvas
+    fImmediateCanvas = SkNEW_ARGS(SkCanvas, (fImmediateDevice));
+    fRecordingCanvas = fPicture.beginRecording(fImmediateDevice->width(),
+        fImmediateDevice->height(), 0);
+}
+
+SkDeferredCanvas::DeferredDevice::~DeferredDevice() {
+    SkSafeUnref(fImmediateCanvas);
+    SkSafeUnref(fDeviceContext);
+}
+    
+void SkDeferredCanvas::DeferredDevice::setDeviceContext(
+    DeviceContext* deviceContext) {
+    SkRefCnt_SafeAssign(fDeviceContext, deviceContext);
+}
+
+void SkDeferredCanvas::DeferredDevice::contentsCleared() {
+    if (!fRecordingCanvas->isDrawingToLayer()) {
+        fFreshFrame = true;
+        // Not drawing to a layer: frame flagged fresh, now try to purge.
+        // TODO: find a way to transfer the state stack and layers
+        // to the new recording canvas.  For now, purging only works
+        // with an empty stack.
+        if (fRecordingCanvas->getSaveCount() == 0) {
+            // NOTE(review): SkCanvas save counts start at 1; confirm == 0 can hit.
+            // Save state that is trashed by the purge
+            SkDrawFilter* drawFilter = fRecordingCanvas->getDrawFilter();
+            SkSafeRef(drawFilter); // So that it survives the purge
+            SkMatrix matrix = fRecordingCanvas->getTotalMatrix();
+            SkRegion clipRegion = fRecordingCanvas->getTotalClip();
+
+            // beginRecording creates a new recording canvas and discards the
+            // old one, hence purging deferred draw ops.
+            fRecordingCanvas = fPicture.beginRecording(
+                fImmediateDevice->width(),
+                fImmediateDevice->height(), 0);
+
+            // Restore pre-purge state (unref presumably balances SkSafeRef above)
+            if (!clipRegion.isEmpty()) {
+                fRecordingCanvas->clipRegion(clipRegion, 
+                    SkRegion::kReplace_Op);
+            }
+            if (!matrix.isIdentity()) {
+                fRecordingCanvas->setMatrix(matrix);
+            }
+            if (drawFilter) {
+                fRecordingCanvas->setDrawFilter(drawFilter)->unref();
+            }
+        }
+    }
+}
+
+bool SkDeferredCanvas::DeferredDevice::isFreshFrame() {
+    const bool wasFresh = fFreshFrame;  // report the flag...
+    fFreshFrame = false;                // ...and consume it
+    return wasFresh;
+}
+
+void SkDeferredCanvas::DeferredDevice::flushPending() {
+    if (fDeviceContext) {
+        fDeviceContext->prepareForDraw();
+    }
+    fPicture.draw(fImmediateCanvas);
+    fRecordingCanvas = fPicture.beginRecording(fImmediateDevice->width(), 
+        fImmediateDevice->height(), 0);
+}
+
+void SkDeferredCanvas::DeferredDevice::flush() {
+    flushPending();
+    fImmediateCanvas->flush();
+}
+
+void SkDeferredCanvas::DeferredDevice::flushIfNeeded(const SkBitmap& bitmap) {
+    if (bitmap.isImmutable()) {
+        return; // safe to defer without registering a dependency
+    }
+    // Presumably the pixels of a mutable bitmap could change before playback.
+    // For now, drawing a writable bitmap triggers a flush
+    // TODO: implement read-only semantics and auto buffer duplication on write
+    // in SkBitmap/SkPixelRef, which will make deferral possible in this case.
+    flushPending();
+}
+
+uint32_t SkDeferredCanvas::DeferredDevice::getDeviceCapabilities() { 
+    return fImmediateDevice->getDeviceCapabilities();
+}
+
+int SkDeferredCanvas::DeferredDevice::width() const { 
+    return fImmediateDevice->width();
+}
+
+int SkDeferredCanvas::DeferredDevice::height() const {
+    return fImmediateDevice->height(); 
+}
+
+SkGpuRenderTarget* SkDeferredCanvas::DeferredDevice::accessRenderTarget() {
+    flushPending();
+    return fImmediateDevice->accessRenderTarget();
+}
+
+void SkDeferredCanvas::DeferredDevice::writePixels(const SkBitmap& bitmap,
+    int x, int y, SkCanvas::Config8888 config8888) {
+    // A bitmap covering the whole device supersedes all pending draws.
+    if (x <= 0 && y <= 0 && (x + bitmap.width()) >= width() &&
+        (y + bitmap.height()) >= height()) {
+        contentsCleared();
+    }
+    if (SkBitmap::kARGB_8888_Config == bitmap.config() &&
+        SkCanvas::kNative_Premul_Config8888 != config8888 &&
+        kPMColorAlias != config8888) {
+        // Special case config: no deferral, and no recording below.
+        flushPending();
+        fImmediateDevice->writePixels(bitmap, x, y, config8888);
+        return;
+    }
+    // Deferred path: record the write as a sprite drawn in kSrc_Mode.
+    SkPaint paint;
+    paint.setXfermodeMode(SkXfermode::kSrc_Mode);
+    fRecordingCanvas->drawSprite(bitmap, x, y, &paint);
+    flushIfNeeded(bitmap);
+}
+
+const SkBitmap& SkDeferredCanvas::DeferredDevice::onAccessBitmap(SkBitmap*) {
+    flushPending();
+    return fImmediateDevice->accessBitmap(false);
+}
+
+SkDevice* SkDeferredCanvas::DeferredDevice::onCreateCompatibleDevice(
+    SkBitmap::Config config, int width, int height, bool isOpaque,
+    Usage usage) {
+    // Save layer usage not supported, and not required by SkDeferredCanvas.
+    SkASSERT(usage != kSaveLayer_Usage);
+    // Create a compatible non-deferred device. The DeferredDevice wrapping it
+    // takes its own ref (via its immediate canvas), so release ours on exit.
+    SkDevice* compatibleDevice = fImmediateDevice->createCompatibleDevice(
+        config, width, height, isOpaque);
+    SkAutoUnref aur(compatibleDevice);
+    return SkNEW_ARGS(DeferredDevice, (compatibleDevice, fDeviceContext));
+}
+
+bool SkDeferredCanvas::DeferredDevice::onReadPixels(
+    const SkBitmap& bitmap, int x, int y, SkCanvas::Config8888 config8888) {
+    flushPending();
+    return fImmediateCanvas->readPixels(const_cast<SkBitmap*>(&bitmap),
+                                                   x, y, config8888);
+}
diff --git a/src/utils/SkDumpCanvas.cpp b/src/utils/SkDumpCanvas.cpp
index db62498..c9b4a95 100644
--- a/src/utils/SkDumpCanvas.cpp
+++ b/src/utils/SkDumpCanvas.cpp
@@ -49,7 +49,7 @@
                              pts[2].fX, pts[2].fY, pts[3].fX, pts[3].fY);
                 break;
             case SkPath::kClose_Verb:
-                str->appendf("X");
+                str->append("X");
                 break;
             case SkPath::kDone_Verb:
                 return;
diff --git a/src/utils/SkNWayCanvas.cpp b/src/utils/SkNWayCanvas.cpp
index 24c992d..06cf32d 100644
--- a/src/utils/SkNWayCanvas.cpp
+++ b/src/utils/SkNWayCanvas.cpp
@@ -49,7 +49,7 @@
         return false;
     }
     SkCanvas* operator->() { return fCanvas; }
-    
+
 private:
     const SkTDArray<SkCanvas*>& fList;
     int fIndex;
diff --git a/src/views/SkEventSink.cpp b/src/views/SkEventSink.cpp
index 0b01669..20d8cdf 100644
--- a/src/views/SkEventSink.cpp
+++ b/src/views/SkEventSink.cpp
@@ -260,7 +260,7 @@
 #include "SkTDict.h"
 
 #define kMinStringBufferSize    128
-static SkMutex                  gNamedSinkMutex;
+SK_DECLARE_STATIC_MUTEX(gNamedSinkMutex);
 static SkTDict<SkEventSinkID>   gNamedSinkIDs(kMinStringBufferSize);
 
 /** Register a name/id pair with the system. If the name already exists,
diff --git a/tests/AAClipTest.cpp b/tests/AAClipTest.cpp
index b3051fd..4f3f759 100644
--- a/tests/AAClipTest.cpp
+++ b/tests/AAClipTest.cpp
@@ -272,15 +272,68 @@
             }
             REPORTER_ASSERT(reporter, nonEmptyAA == nonEmptyBW);
             REPORTER_ASSERT(reporter, clip2.getBounds() == rgn2.getBounds());
+            
+            SkMask maskBW, maskAA;
+            copyToMask(rgn2, &maskBW);
+            clip2.copyToMask(&maskAA);
+            REPORTER_ASSERT(reporter, maskBW == maskAA);
         }
     }
 }
 
+static void test_path_with_hole(skiatest::Reporter* reporter) {
+    static const uint8_t gExpectedImage[] = {
+        0xFF, 0xFF, 0xFF, 0xFF,
+        0xFF, 0xFF, 0xFF, 0xFF,
+        0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00,
+        0xFF, 0xFF, 0xFF, 0xFF,
+        0xFF, 0xFF, 0xFF, 0xFF,
+    };
+    SkMask expected;
+    expected.fBounds.set(0, 0, 4, 6);
+    expected.fRowBytes = 4;
+    expected.fFormat = SkMask::kA8_Format;
+    expected.fImage = (uint8_t*)gExpectedImage;
+
+    SkPath path;
+    path.addRect(SkRect::MakeXYWH(0, 0,
+                                  SkIntToScalar(4), SkIntToScalar(2)));
+    path.addRect(SkRect::MakeXYWH(0, SkIntToScalar(4),
+                                  SkIntToScalar(4), SkIntToScalar(2)));
+
+    for (int i = 0; i < 2; ++i) {
+        SkAAClip clip;
+        clip.setPath(path, NULL, 1 == i);
+        
+        SkMask mask;
+        clip.copyToMask(&mask);
+        
+        REPORTER_ASSERT(reporter, expected == mask);
+    }
+}
+
+static void test_regressions(skiatest::Reporter* reporter) {
+    // these should not assert in the debug build
+    // bug was introduced in rev. 3209
+    {
+        SkAAClip clip;
+        SkRect r;
+        r.fLeft = SkFloatToScalar(129.892181);
+        r.fTop = SkFloatToScalar(10.3999996);
+        r.fRight = SkFloatToScalar(130.892181); 
+        r.fBottom = SkFloatToScalar(20.3999996);
+        clip.setRect(r, true);
+    }
+}
+
 static void TestAAClip(skiatest::Reporter* reporter) {
     test_empty(reporter);
     test_path_bounds(reporter);
     test_irect(reporter);
     test_rgn(reporter);
+    test_path_with_hole(reporter);
+    test_regressions(reporter);
 }
 
 #include "TestClassDef.h"
diff --git a/tests/Android.mk b/tests/Android.mk
index 14d23f8..8487699 100644
--- a/tests/Android.mk
+++ b/tests/Android.mk
@@ -16,11 +16,14 @@
   ColorFilterTest.cpp \
   ColorTest.cpp \
   DataRefTest.cpp \
+  DeferredCanvasTest.cpp \
   DequeTest.cpp \
   DrawBitmapRectTest.cpp \
+  DrawTextTest.cpp \
   EmptyPathTest.cpp \
   FillPathTest.cpp \
   FlateTest.cpp \
+  FontHostTest.cpp \
   GeometryTest.cpp \
   GLInterfaceValidation.cpp \
   GLProgramsTest.cpp \
@@ -37,6 +40,7 @@
   PathMeasureTest.cpp \
   PathTest.cpp \
   PointTest.cpp \
+  PremulAlphaRoundTripTest.cpp \
   QuickRejectTest.cpp \
   Reader32Test.cpp \
   ReadPixelsTest.cpp \
diff --git a/tests/BitmapCopyTest.cpp b/tests/BitmapCopyTest.cpp
index b8d16bf..d5fd7df 100644
--- a/tests/BitmapCopyTest.cpp
+++ b/tests/BitmapCopyTest.cpp
@@ -308,12 +308,16 @@
                 }
                 // test extractSubset
                 {
+                    SkBitmap bitmap(src);
                     SkBitmap subset;
                     SkIRect r;
                     r.set(1, 1, 2, 2);
-                    if (src.extractSubset(&subset, r)) {
+                    bitmap.setIsVolatile(true);
+                    if (bitmap.extractSubset(&subset, r)) {
                         REPORTER_ASSERT(reporter, subset.width() == 1);
                         REPORTER_ASSERT(reporter, subset.height() == 1);
+                        REPORTER_ASSERT(reporter,
+                                        subset.isVolatile() == true);
 
                         SkBitmap copy;
                         REPORTER_ASSERT(reporter,
@@ -329,6 +333,11 @@
                         REPORTER_ASSERT(reporter,
                                     (copy.getColorTable() != NULL) == hasCT);
                     }
+                    bitmap.setIsVolatile(false);
+                    if (bitmap.extractSubset(&subset, r)) {
+                        REPORTER_ASSERT(reporter,
+                                        subset.isVolatile() == false);
+                    }
                 }
             } else {
                 // dst should be unchanged from its initial state
diff --git a/tests/CanvasTest.cpp b/tests/CanvasTest.cpp
index da1fafd..7cafda2 100644
--- a/tests/CanvasTest.cpp
+++ b/tests/CanvasTest.cpp
@@ -1,66 +1,759 @@
 
 /*
- * Copyright 2011 Google Inc.
+ * Copyright 2012 Google Inc.
  *
  * Use of this source code is governed by a BSD-style license that can be
  * found in the LICENSE file.
  */
-#include "Test.h"
+
+/*  Description:
+ *      This test defines a series of elementary test steps that perform
+ *      a single or a small group of canvas API calls. Each test step is
+ *      used in several test cases that verify that different types of SkCanvas
+ *      flavors and derivatives pass it and yield consistent behavior. The
+ *      test cases analyse results that are queryable through the API. They do
+ *      not look at rendering results.
+ *
+ *  Adding test steps:
+ *      The general pattern for creating a new test step is to write a test
+ *      function of the form:
+ *
+ *          static void MyTestStepFunction(SkCanvas* canvas, 
+ *                                         skiatest::Reporter* reporter,
+ *                                         CanvasTestStep* testStep)
+ *          {
+ *              canvas->someCanvasAPImethod();
+ *              (...)
+ *              REPORTER_ASSERT_MESSAGE(reporter, (...), \
+ *                  testStep->assertMessage());
+ *          }
+ *
+ *      The definition of the test step function should be followed by an
+ *      invocation of the TEST_STEP macro, which generates a class and
+ *      instance for the test step:
+ *
+ *          TEST_STEP(MyTestStep, MyTestStepFunction)
+ *
+ *      There are also shorthand macros for defining simple test steps
+ *      in a single line of code.  A simple test step is one that is made
+ *      of a single canvas API call.
+ *
+ *          SIMPLE_TEST_STEP(MyTestStep, someCanvasAPIMethod());
+ *
+ *      There is another macro called SIMPLE_TEST_STEP_WITH_ASSERT that
+ *      works the same way as SIMPLE_TEST_STEP, and additionally verifies
+ *      that the invoked method returns a non-zero value.
+ */
 #include "SkBitmap.h"
 #include "SkCanvas.h"
+#include "SkDeferredCanvas.h"
+#include "SkDevice.h"
+#include "SkMatrix.h"
+#include "SkNWayCanvas.h"
+#include "SkPaint.h"
+#include "SkPath.h"
+#include "SkPicture.h"
+#include "SkPictureRecord.h"
+#include "SkProxyCanvas.h"
+#include "SkRect.h"
+#include "SkRegion.h"
+#include "SkShader.h"
+#include "SkStream.h"
+#include "SkTDArray.h"
+#include "Test.h"
 
-static void test_isDrawingToLayer(skiatest::Reporter* reporter) {
-    SkBitmap bm;
-    bm.setConfig(SkBitmap::kARGB_8888_Config, 256, 256);
-    bm.allocPixels();
-    
-    SkCanvas canvas(bm);
+static const int kWidth = 2;
+static const int kHeight = 2;
+// Maximum stream length for picture serialization
+static const size_t kMaxPictureBufferSize = 1024; 
 
-    REPORTER_ASSERT(reporter, !canvas.isDrawingToLayer());
-    canvas.save();
-    REPORTER_ASSERT(reporter, !canvas.isDrawingToLayer());
+// Format strings that describe the test context.  The %s token is where
+// the name of the test step is inserted.  The context is required for
+// disambiguating the error in the case of failures that are reported in
+// functions that are called multiple times in different contexts (test
+// cases and test steps).
+static const char* const kDefaultAssertMessageFormat = "%s";
+static const char* const kCanvasDrawAssertMessageFormat = 
+    "Drawing test step %s with SkCanvas";
+static const char* const kPictureDrawAssertMessageFormat = 
+    "Drawing test step %s with SkPicture";
+static const char* const kPictureSecondDrawAssertMessageFormat = 
+    "Duplicate draw of test step %s with SkPicture";
+static const char* const kPictureReDrawAssertMessageFormat = 
+    "Playing back test step %s from an SkPicture to another SkPicture";
+static const char* const kDeferredDrawAssertMessageFormat = 
+    "Drawing test step %s with SkDeferredCanvas";
+static const char* const kProxyDrawAssertMessageFormat = 
+    "Drawing test step %s with SkProxyCanvas";
+static const char* const kNWayDrawAssertMessageFormat = 
+    "Drawing test step %s with SkNWayCanvas";
+static const char* const kRoundTripAssertMessageFormat = 
+    "test step %s, SkPicture consistency after round trip";
+static const char* const kPictureRecoringAssertMessageFormat = 
+    "test step %s, SkPicture state consistency after recording";
+static const char* const kPicturePlaybackAssertMessageFormat = 
+    "test step %s, SkPicture state consistency in playback canvas";
+static const char* const kDeferredPreFlushAssertMessageFormat = 
+    "test step %s, SkDeferredCanvas state consistency before flush";
+static const char* const kDeferredPostFlushAssertMessageFormat = 
+    "test step %s, SkDeferredCanvas state consistency after flush";
+static const char* const kPictureResourceReuseMessageFormat =
+    "test step %s, SkPicture duplicate flattened object test";
+static const char* const kProxyStateAssertMessageFormat =
+    "test step %s, SkProxyCanvas state consistency";
+static const char* const kProxyIndirectStateAssertMessageFormat =
+    "test step %s, SkProxyCanvas indirect canvas state consistency";
+static const char* const kNWayStateAssertMessageFormat =
+    "test step %s, SkNWayCanvas state consistency";
+static const char* const kNWayIndirect1StateAssertMessageFormat =
+    "test step %s, SkNWayCanvas indirect canvas 1 state consistency";
+static const char* const kNWayIndirect2StateAssertMessageFormat =
+    "test step %s, SkNWayCanvas indirect canvas 2 state consistency";
+
+static void createBitmap(SkBitmap* bm, SkBitmap::Config config, SkColor color) {
+    bm->setConfig(config, kWidth, kHeight);
+    bm->allocPixels();
+    bm->eraseColor(color);
+}
+
+class CanvasTestStep;
+static SkTDArray<CanvasTestStep*>& testStepArray() {
+    static SkTDArray<CanvasTestStep*> theTests;
+    return theTests;
+}
+
+class CanvasTestStep {
+public:
+    CanvasTestStep() {
+        *testStepArray().append() = this; // self-register in testStepArray()
+        fAssertMessageFormat = kDefaultAssertMessageFormat;
+    }
+    virtual ~CanvasTestStep() { }
+    // Interface implemented by each step (see the TEST_STEP macro).
+    virtual void draw(SkCanvas*, skiatest::Reporter*) = 0;
+    virtual const char* name() const = 0;
+    // Returns name() formatted with the current message format.
+    const char* assertMessage() {
+        fAssertMessage.printf(fAssertMessageFormat, name());
+        return fAssertMessage.c_str();
+    }
+    // Keeps the pointer (no copy); format is used with name() as its %s.
+    void setAssertMessageFormat(const char* format) {
+        fAssertMessageFormat = format;
+    }
+
+private:
+    SkString fAssertMessage;
+    const char* fAssertMessageFormat;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// Constants used by test steps
+
+const SkRect kTestRect = 
+    SkRect::MakeXYWH(SkIntToScalar(0), SkIntToScalar(0),
+                     SkIntToScalar(2), SkIntToScalar(1));
+static SkMatrix testMatrix() {
+    SkMatrix matrix;
+    matrix.reset();
+    matrix.setScale(SkIntToScalar(2), SkIntToScalar(3));
+    return matrix;
+}
+const SkMatrix kTestMatrix = testMatrix();
+static SkPath testPath() {
+    SkPath path;
+    path.addRect(SkRect::MakeXYWH(SkIntToScalar(0), SkIntToScalar(0),
+                                  SkIntToScalar(2), SkIntToScalar(1)));
+    return path;
+}
+const SkPath kTestPath = testPath();
+static SkRegion testRegion() {
+    SkRegion region;
+    SkIRect rect = SkIRect::MakeXYWH(0, 0, 2, 1);
+    region.setRect(rect);
+    return region;
+}
+const SkIRect kTestIRect = SkIRect::MakeXYWH(0, 0, 2, 1);
+const SkRegion kTestRegion = testRegion();
+const SkColor kTestColor = 0x01020304;
+const SkPaint kTestPaint;
+const SkPoint kTestPoints[3] = {
+    {SkIntToScalar(0), SkIntToScalar(0)},
+    {SkIntToScalar(2), SkIntToScalar(1)},
+    {SkIntToScalar(0), SkIntToScalar(2)}
+};
+const size_t kTestPointCount = 3;
+static SkBitmap testBitmap() {
+    SkBitmap bitmap;
+    createBitmap(&bitmap, SkBitmap::kARGB_8888_Config, 0x05060708);
+    return bitmap;
+}
+SkBitmap kTestBitmap; // cannot be created during static init
+SkString kTestText("Hello World");
+SkPoint kTestPoint = SkPoint::Make(SkIntToScalar(0), SkIntToScalar(1));
+
+///////////////////////////////////////////////////////////////////////////////
+// Macros for defining test steps
+
+#define TEST_STEP(NAME, FUNCTION)                                       \
+class NAME##_TestStep : public CanvasTestStep{                          \
+public:                                                                 \
+    virtual void draw(SkCanvas* canvas, skiatest::Reporter* reporter) { \
+        FUNCTION (canvas, reporter, this);                              \
+    }                                                                   \
+    virtual const char* name() const {return #NAME ;}                   \
+};                                                                      \
+static NAME##_TestStep NAME##_TestStepInstance;
+
+#define SIMPLE_TEST_STEP(NAME, CALL)                              \
+static void NAME##TestStep(SkCanvas* canvas, skiatest::Reporter*, \
+    CanvasTestStep*) {                                            \
+    canvas-> CALL ;                                               \
+}                                                                 \
+TEST_STEP(NAME, NAME##TestStep )
+
+#define SIMPLE_TEST_STEP_WITH_ASSERT(NAME, CALL)                           \
+static void NAME##TestStep(SkCanvas* canvas, skiatest::Reporter* reporter, \
+    CanvasTestStep* testStep) {                                            \
+    REPORTER_ASSERT_MESSAGE(reporter, canvas-> CALL ,                      \
+        testStep->assertMessage());                                        \
+}                                                                          \
+TEST_STEP(NAME, NAME##TestStep )
+
+
+///////////////////////////////////////////////////////////////////////////////
+// Basic test steps for most virtual methods in SkCanvas that draw or affect 
+// the state of the canvas.
+
+SIMPLE_TEST_STEP(SaveMatrix, save(SkCanvas::kMatrix_SaveFlag));
+SIMPLE_TEST_STEP(SaveClip, save(SkCanvas::kClip_SaveFlag));
+SIMPLE_TEST_STEP(SaveMatrixClip, save(SkCanvas::kMatrixClip_SaveFlag));
+SIMPLE_TEST_STEP(SaveLayer, saveLayer(NULL, NULL));
+SIMPLE_TEST_STEP(BoundedSaveLayer, saveLayer(&kTestRect, NULL));
+SIMPLE_TEST_STEP(PaintSaveLayer, saveLayer(NULL, &kTestPaint));
+SIMPLE_TEST_STEP_WITH_ASSERT(Translate,
+    translate(SkIntToScalar(1), SkIntToScalar(2)));
+SIMPLE_TEST_STEP_WITH_ASSERT(Scale,
+    scale(SkIntToScalar(1), SkIntToScalar(2)));
+SIMPLE_TEST_STEP_WITH_ASSERT(Rotate, rotate(SkIntToScalar(1)));
+SIMPLE_TEST_STEP_WITH_ASSERT(Skew,
+    skew(SkIntToScalar(1), SkIntToScalar(2)));
+SIMPLE_TEST_STEP_WITH_ASSERT(Concat, concat(kTestMatrix));
+SIMPLE_TEST_STEP(SetMatrix, setMatrix(kTestMatrix));
+SIMPLE_TEST_STEP_WITH_ASSERT(ClipRect, clipRect(kTestRect));
+SIMPLE_TEST_STEP_WITH_ASSERT(ClipPath, clipPath(kTestPath));
+SIMPLE_TEST_STEP_WITH_ASSERT(ClipRegion,
+    clipRegion(kTestRegion, SkRegion::kReplace_Op));
+SIMPLE_TEST_STEP(Clear, clear(kTestColor));
+SIMPLE_TEST_STEP(DrawPaint, drawPaint(kTestPaint));
+SIMPLE_TEST_STEP(DrawPointsPoints, drawPoints(SkCanvas::kPoints_PointMode,
+    kTestPointCount, kTestPoints, kTestPaint));
+SIMPLE_TEST_STEP(DrawPointsLiness, drawPoints(SkCanvas::kLines_PointMode,
+    kTestPointCount, kTestPoints, kTestPaint));
+SIMPLE_TEST_STEP(DrawPointsPolygon, drawPoints(SkCanvas::kPolygon_PointMode,
+    kTestPointCount, kTestPoints, kTestPaint));
+SIMPLE_TEST_STEP(DrawRect, drawRect(kTestRect, kTestPaint));
+SIMPLE_TEST_STEP(DrawPath, drawPath(kTestPath, kTestPaint));
+SIMPLE_TEST_STEP(DrawBitmap, drawBitmap(kTestBitmap, 0, 0));
+SIMPLE_TEST_STEP(DrawBitmapPaint, drawBitmap(kTestBitmap, 0, 0, &kTestPaint));
+SIMPLE_TEST_STEP(DrawBitmapRect, drawBitmapRect(kTestBitmap, NULL, kTestRect,
+    NULL));
+SIMPLE_TEST_STEP(DrawBitmapRectSrcRect, drawBitmapRect(kTestBitmap,
+    &kTestIRect, kTestRect, NULL));
+SIMPLE_TEST_STEP(DrawBitmapRectPaint, drawBitmapRect(kTestBitmap, NULL,
+    kTestRect, &kTestPaint));
+SIMPLE_TEST_STEP(DrawBitmapMatrix, drawBitmapMatrix(kTestBitmap, kTestMatrix,
+    NULL));
+SIMPLE_TEST_STEP(DrawBitmapMatrixPaint, drawBitmapMatrix(kTestBitmap,
+    kTestMatrix, &kTestPaint));
+SIMPLE_TEST_STEP(DrawBitmapNine, drawBitmapNine(kTestBitmap, kTestIRect,
+    kTestRect, NULL));
+SIMPLE_TEST_STEP(DrawBitmapNinePaint, drawBitmapNine(kTestBitmap, kTestIRect,
+    kTestRect, &kTestPaint));
+SIMPLE_TEST_STEP(DrawSprite, drawSprite(kTestBitmap, 0, 0, NULL));
+SIMPLE_TEST_STEP(DrawSpritePaint, drawSprite(kTestBitmap, 0, 0, &kTestPaint));
+SIMPLE_TEST_STEP(DrawText, drawText(kTestText.c_str(), kTestText.size(),
+    0, 1, kTestPaint));
+SIMPLE_TEST_STEP(DrawPosText, drawPosText(kTestText.c_str(),
+    kTestText.size(), &kTestPoint, kTestPaint));
+SIMPLE_TEST_STEP(DrawTextOnPath, drawTextOnPath(kTestText.c_str(),
+    kTestText.size(), kTestPath, NULL, kTestPaint));
+SIMPLE_TEST_STEP(DrawTextOnPathMatrix, drawTextOnPath(kTestText.c_str(),
+    kTestText.size(), kTestPath, &kTestMatrix, kTestPaint));
+SIMPLE_TEST_STEP(SetExternalMatrix, setExternalMatrix(&kTestMatrix));
+SIMPLE_TEST_STEP(DrawData, drawData(kTestText.c_str(), kTestText.size()));
+
+///////////////////////////////////////////////////////////////////////////////
+// Complex test steps
+
+static void DrawVerticesShaderTestStep(SkCanvas* canvas, 
+                                       skiatest::Reporter* reporter,
+                                       CanvasTestStep* testStep) {
+    SkPoint pts[4];
+    pts[0].set(0, 0);
+    pts[1].set(SkIntToScalar(kWidth), 0);
+    pts[2].set(SkIntToScalar(kWidth), SkIntToScalar(kHeight));
+    pts[3].set(0, SkIntToScalar(kHeight));
+    SkPaint paint;
+    SkShader* shader = SkShader::CreateBitmapShader(kTestBitmap,
+        SkShader::kClamp_TileMode, SkShader::kClamp_TileMode);
+    paint.setShader(shader)->unref();
+    canvas->drawVertices(SkCanvas::kTriangleFan_VertexMode, 4, pts, pts,
+                         NULL, NULL, NULL, 0, paint);
+}
+TEST_STEP(DrawVerticesShader, DrawVerticesShaderTestStep);
+
+static void DrawPictureTestStep(SkCanvas* canvas, 
+                                skiatest::Reporter* reporter,
+                                CanvasTestStep* testStep) {
+    SkPicture* testPicture = SkNEW_ARGS(SkPicture, ());
+    SkAutoUnref aup(testPicture);
+    SkCanvas* testCanvas = testPicture->beginRecording(kWidth, kHeight);
+    testCanvas->scale(SkIntToScalar(2), SkIntToScalar(1));
+    testCanvas->clipRect(kTestRect);
+    testCanvas->drawRect(kTestRect, kTestPaint);
+    canvas->drawPicture(*testPicture);
+}
+TEST_STEP(DrawPicture, DrawPictureTestStep);
+
+static void SaveRestoreTestStep(SkCanvas* canvas, 
+                                skiatest::Reporter* reporter,
+                                CanvasTestStep* testStep) {
+    REPORTER_ASSERT_MESSAGE(reporter, 1 == canvas->getSaveCount(), 
+        testStep->assertMessage());
+    size_t n = canvas->save();
+    REPORTER_ASSERT_MESSAGE(reporter, 1 == n, testStep->assertMessage());
+    REPORTER_ASSERT_MESSAGE(reporter, 2 == canvas->getSaveCount(),
+        testStep->assertMessage());
+    canvas->save();
+    canvas->save();
+    REPORTER_ASSERT_MESSAGE(reporter, 4 == canvas->getSaveCount(),
+        testStep->assertMessage());
+    canvas->restoreToCount(2);
+    REPORTER_ASSERT_MESSAGE(reporter, 2 == canvas->getSaveCount(),
+        testStep->assertMessage());
+
+    // should this pin to 1, or be a no-op, or crash?
+    canvas->restoreToCount(0);
+    REPORTER_ASSERT_MESSAGE(reporter, 1 == canvas->getSaveCount(),
+        testStep->assertMessage());
+}
+TEST_STEP(SaveRestore, SaveRestoreTestStep);
+
+static void DrawLayerTestStep(SkCanvas* canvas, 
+                              skiatest::Reporter* reporter,
+                              CanvasTestStep* testStep) {
+    REPORTER_ASSERT_MESSAGE(reporter, !canvas->isDrawingToLayer(),
+        testStep->assertMessage());
+    canvas->save();
+    REPORTER_ASSERT_MESSAGE(reporter, !canvas->isDrawingToLayer(),
+        testStep->assertMessage());
     
     const SkRect* bounds = NULL;    // null means include entire bounds
     const SkPaint* paint = NULL;
 
-    canvas.saveLayer(bounds, paint);
-    REPORTER_ASSERT(reporter, canvas.isDrawingToLayer());
-    canvas.restore();
-    REPORTER_ASSERT(reporter, !canvas.isDrawingToLayer());
+    canvas->saveLayer(bounds, paint);
+    REPORTER_ASSERT_MESSAGE(reporter, canvas->isDrawingToLayer(),
+        testStep->assertMessage());
+    canvas->restore();
+    REPORTER_ASSERT_MESSAGE(reporter, !canvas->isDrawingToLayer(),
+        testStep->assertMessage());
 
-    canvas.saveLayer(bounds, paint);
-    canvas.saveLayer(bounds, paint);
-    REPORTER_ASSERT(reporter, canvas.isDrawingToLayer());
-    canvas.restore();
-    REPORTER_ASSERT(reporter, canvas.isDrawingToLayer());
-    canvas.restore();
+    canvas->saveLayer(bounds, paint);
+    canvas->saveLayer(bounds, paint);
+    REPORTER_ASSERT_MESSAGE(reporter, canvas->isDrawingToLayer(),
+        testStep->assertMessage());
+    canvas->restore();
+    REPORTER_ASSERT_MESSAGE(reporter, canvas->isDrawingToLayer(),
+        testStep->assertMessage());
+    canvas->restore();
     // now layer count should be 0
-    REPORTER_ASSERT(reporter, !canvas.isDrawingToLayer());
+    REPORTER_ASSERT_MESSAGE(reporter, !canvas->isDrawingToLayer(),
+        testStep->assertMessage());
+}
+TEST_STEP(DrawLayer, DrawLayerTestStep);
+
+static void AssertCanvasStatesEqual(skiatest::Reporter* reporter,
+                                    const SkCanvas* canvas1, 
+                                    const SkCanvas* canvas2,
+                                    CanvasTestStep* testStep) {
+    REPORTER_ASSERT_MESSAGE(reporter, canvas1->getDeviceSize() ==
+        canvas2->getDeviceSize(), testStep->assertMessage());
+    REPORTER_ASSERT_MESSAGE(reporter, canvas1->getSaveCount() ==
+        canvas2->getSaveCount(), testStep->assertMessage());
+    REPORTER_ASSERT_MESSAGE(reporter, canvas1->isDrawingToLayer() ==
+        canvas2->isDrawingToLayer(), testStep->assertMessage());
+    SkRect bounds1, bounds2;
+    REPORTER_ASSERT_MESSAGE(reporter,
+        canvas1->getClipBounds(&bounds1, SkCanvas::kAA_EdgeType) ==
+        canvas2->getClipBounds(&bounds2, SkCanvas::kAA_EdgeType),
+        testStep->assertMessage());
+    REPORTER_ASSERT_MESSAGE(reporter, bounds1 == bounds2,
+        testStep->assertMessage());
+    REPORTER_ASSERT_MESSAGE(reporter,
+        canvas1->getClipBounds(&bounds1, SkCanvas::kBW_EdgeType) ==
+        canvas2->getClipBounds(&bounds2, SkCanvas::kBW_EdgeType),
+        testStep->assertMessage());
+    REPORTER_ASSERT_MESSAGE(reporter, bounds1 == bounds2,
+        testStep->assertMessage());
+    REPORTER_ASSERT_MESSAGE(reporter, canvas1->getDrawFilter() ==
+        canvas2->getDrawFilter(), testStep->assertMessage());
+    SkIRect deviceBounds1, deviceBounds2;
+    REPORTER_ASSERT_MESSAGE(reporter,
+        canvas1->getClipDeviceBounds(&deviceBounds1) ==
+        canvas2->getClipDeviceBounds(&deviceBounds2),
+        testStep->assertMessage());
+    REPORTER_ASSERT_MESSAGE(reporter, deviceBounds1 == deviceBounds2,
+        testStep->assertMessage());
+    REPORTER_ASSERT_MESSAGE(reporter, canvas1->getBounder() ==
+        canvas2->getBounder(), testStep->assertMessage());
+    REPORTER_ASSERT_MESSAGE(reporter, canvas1->getTotalMatrix() ==
+        canvas2->getTotalMatrix(), testStep->assertMessage());
+    REPORTER_ASSERT_MESSAGE(reporter, canvas1->getClipType() ==
+        canvas2->getClipType(), testStep->assertMessage());
+    REPORTER_ASSERT_MESSAGE(reporter, canvas1->getTotalClip() ==
+        canvas2->getTotalClip(), testStep->assertMessage());
+    REPORTER_ASSERT_MESSAGE(reporter, canvas1->getTotalClipStack() ==
+        canvas2->getTotalClipStack(), testStep->assertMessage());
+
+    // The following test code is commented out because the test fails when
+    // the canvas is an SkPictureRecord or SkDeferredCanvas 
+    // Issue: http://code.google.com/p/skia/issues/detail?id=498
+    // Also, creating a LayerIter on an SkProxyCanvas crashes
+    // Issue: http://code.google.com/p/skia/issues/detail?id=499
+    /*
+    SkCanvas::LayerIter layerIter1(const_cast<SkCanvas*>(canvas1), false);
+    SkCanvas::LayerIter layerIter2(const_cast<SkCanvas*>(canvas2), false);
+    while (!layerIter1.done() && !layerIter2.done()) {
+        REPORTER_ASSERT_MESSAGE(reporter, layerIter1.matrix() ==
+            layerIter2.matrix(), testStep->assertMessage());
+        REPORTER_ASSERT_MESSAGE(reporter, layerIter1.clip() ==
+            layerIter2.clip(), testStep->assertMessage());
+        REPORTER_ASSERT_MESSAGE(reporter, layerIter1.paint() ==
+            layerIter2.paint(), testStep->assertMessage());
+        REPORTER_ASSERT_MESSAGE(reporter, layerIter1.x() ==
+            layerIter2.x(), testStep->assertMessage());
+        REPORTER_ASSERT_MESSAGE(reporter, layerIter1.y() ==
+            layerIter2.y(), testStep->assertMessage());
+        layerIter1.next();
+        layerIter2.next();
+    }
+    REPORTER_ASSERT_MESSAGE(reporter, layerIter1.done(),
+        testStep->assertMessage());
+    REPORTER_ASSERT_MESSAGE(reporter, layerIter2.done(),
+        testStep->assertMessage());
+    */
+}
+
+// The following class groups static functions that need to access
+// the private members of SkPictureRecord
+class SkPictureTester {
+private:
+    static void AssertFlattenedObjectsEqual(
+        SkPictureRecord* referenceRecord,
+        SkPictureRecord* testRecord,
+        skiatest::Reporter* reporter,
+        CanvasTestStep* testStep) {
+
+        REPORTER_ASSERT_MESSAGE(reporter,
+            referenceRecord->fBitmaps.count() ==
+            testRecord->fBitmaps.count(), testStep->assertMessage());
+        for (int i = 0; i < referenceRecord->fBitmaps.count(); ++i) {
+            REPORTER_ASSERT_MESSAGE(reporter,
+                SkFlatData::Compare(referenceRecord->fBitmaps[i],
+                testRecord->fBitmaps[i]) == 0, testStep->assertMessage());
+        }
+        REPORTER_ASSERT_MESSAGE(reporter,
+            referenceRecord->fMatrices.count() ==
+            testRecord->fMatrices.count(), testStep->assertMessage());
+        for (int i = 0; i < referenceRecord->fMatrices.count(); ++i) {
+            REPORTER_ASSERT_MESSAGE(reporter,
+                SkFlatData::Compare(referenceRecord->fMatrices[i],
+                testRecord->fMatrices[i]) == 0,
+                testStep->assertMessage());
+        }
+        REPORTER_ASSERT_MESSAGE(reporter,
+            referenceRecord->fPaints.count() ==
+            testRecord->fPaints.count(), testStep->assertMessage());
+        for (int i = 0; i < referenceRecord->fPaints.count(); ++i) {
+            REPORTER_ASSERT_MESSAGE(reporter,
+                SkFlatData::Compare(referenceRecord->fPaints[i],
+                testRecord->fPaints[i]) == 0, testStep->assertMessage());
+        }
+        REPORTER_ASSERT_MESSAGE(reporter,
+            referenceRecord->fRegions.count() ==
+            testRecord->fRegions.count(), testStep->assertMessage());
+        for (int i = 0; i < referenceRecord->fRegions.count(); ++i) {
+            REPORTER_ASSERT_MESSAGE(reporter,
+                SkFlatData::Compare(referenceRecord->fRegions[i],
+                testRecord->fRegions[i]) == 0, testStep->assertMessage());
+        }
+        REPORTER_ASSERT_MESSAGE(reporter,
+            !referenceRecord->fPathHeap ==
+            !testRecord->fPathHeap,
+            testStep->assertMessage());
+        // The following tests are commented out because they currently
+        // fail. Issue: http://code.google.com/p/skia/issues/detail?id=507
+        /*
+        if (referenceRecord->fPathHeap) {
+            REPORTER_ASSERT_MESSAGE(reporter,
+                referenceRecord->fPathHeap->count() ==
+                testRecord->fPathHeap->count(),
+                testStep->assertMessage());
+            for (int i = 0; i < referenceRecord->fPathHeap->count(); ++i) {
+                REPORTER_ASSERT_MESSAGE(reporter,
+                    (*referenceRecord->fPathHeap)[i] ==
+                    (*testRecord->fPathHeap)[i], testStep->assertMessage());
+            }
+        }
+        */
+    
+    }
+
+public:
+
+    static void TestPictureSerializationRoundTrip(skiatest::Reporter* reporter, 
+                                                  CanvasTestStep* testStep) {
+        testStep->setAssertMessageFormat(kPictureDrawAssertMessageFormat);
+        SkPicture referencePicture;
+        testStep->draw(referencePicture.beginRecording(kWidth, kHeight),
+            reporter);
+        SkPicture initialPicture;
+        testStep->draw(initialPicture.beginRecording(kWidth, kHeight),
+            reporter);
+        testStep->setAssertMessageFormat(kPictureReDrawAssertMessageFormat);
+        SkPicture roundTripPicture;
+        initialPicture.draw(roundTripPicture.beginRecording(kWidth, kHeight));
+
+        SkPictureRecord* referenceRecord = static_cast<SkPictureRecord*>(
+            referencePicture.getRecordingCanvas());
+        SkPictureRecord* roundTripRecord = static_cast<SkPictureRecord*>(
+            roundTripPicture.getRecordingCanvas());
+
+        testStep->setAssertMessageFormat(kPictureReDrawAssertMessageFormat);
+
+        // Verify that deserialization-serialization round trip conserves all
+        // data by comparing referenceRecord to roundTripRecord
+        REPORTER_ASSERT_MESSAGE(reporter, referenceRecord->fBitmapIndex ==
+            roundTripRecord->fBitmapIndex, testStep->assertMessage());
+        REPORTER_ASSERT_MESSAGE(reporter, referenceRecord->fMatrixIndex ==
+            roundTripRecord->fMatrixIndex, testStep->assertMessage());
+        REPORTER_ASSERT_MESSAGE(reporter, referenceRecord->fPaintIndex ==
+            roundTripRecord->fPaintIndex, testStep->assertMessage());
+        REPORTER_ASSERT_MESSAGE(reporter, referenceRecord->fRegionIndex ==
+            roundTripRecord->fRegionIndex, testStep->assertMessage());
+        char referenceBuffer[kMaxPictureBufferSize];
+        SkMemoryWStream referenceStream(referenceBuffer,
+            kMaxPictureBufferSize);
+        referenceRecord->fWriter.writeToStream(&referenceStream);
+        char roundTripBuffer[kMaxPictureBufferSize];
+        SkMemoryWStream roundTripStream(roundTripBuffer,
+            kMaxPictureBufferSize);
+        roundTripRecord->fWriter.writeToStream(&roundTripStream);
+        REPORTER_ASSERT_MESSAGE(reporter,
+            roundTripStream.bytesWritten() == referenceStream.bytesWritten(),
+            testStep->assertMessage());
+        REPORTER_ASSERT_MESSAGE(reporter, 0 == memcmp(referenceBuffer,
+            roundTripBuffer, roundTripStream.bytesWritten()),
+            testStep->assertMessage());
+        REPORTER_ASSERT_MESSAGE(reporter, referenceRecord->fRecordFlags ==
+            roundTripRecord->fRecordFlags, testStep->assertMessage());
+        REPORTER_ASSERT_MESSAGE(reporter,
+            referenceRecord->fRestoreOffsetStack ==
+            roundTripRecord->fRestoreOffsetStack,
+            testStep->assertMessage());
+        AssertFlattenedObjectsEqual(referenceRecord, roundTripRecord,
+            reporter, testStep);
+        AssertCanvasStatesEqual(reporter, referenceRecord, roundTripRecord,
+            testStep);
+    }
+
+    static void TestPictureFlattenedObjectReuse(skiatest::Reporter* reporter, 
+                                         CanvasTestStep* testStep) {
+        // Verify that when a test step is executed twice, no extra resources
+        // are flattened during the second execution
+        testStep->setAssertMessageFormat(kPictureDrawAssertMessageFormat);
+        SkPicture referencePicture;
+        SkCanvas* referenceCanvas = referencePicture.beginRecording(kWidth,
+            kHeight);
+        testStep->draw(referenceCanvas, reporter);
+        SkPicture testPicture;
+        SkCanvas* testCanvas = testPicture.beginRecording(kWidth,
+            kHeight);
+        testStep->draw(testCanvas, reporter);
+        testStep->setAssertMessageFormat(kPictureSecondDrawAssertMessageFormat);
+        testStep->draw(testCanvas, reporter);
+
+        SkPictureRecord* referenceRecord = static_cast<SkPictureRecord*>(
+            referenceCanvas);
+        SkPictureRecord* testRecord = static_cast<SkPictureRecord*>(
+            testCanvas);
+        testStep->setAssertMessageFormat(kPictureResourceReuseMessageFormat);
+        AssertFlattenedObjectsEqual(referenceRecord, testRecord,
+            reporter, testStep);
+    }
+};
+
+static void TestPictureStateConsistency(skiatest::Reporter* reporter, 
+                                        CanvasTestStep* testStep,
+                                        const SkCanvas& referenceCanvas) {
+    // Verify that the recording canvas's state is consistent
+    // with that of a regular canvas
+    SkPicture testPicture;
+    SkCanvas* pictureCanvas = testPicture.beginRecording(kWidth, kHeight);
+    testStep->setAssertMessageFormat(kPictureDrawAssertMessageFormat);
+    testStep->draw(pictureCanvas, reporter);
+    testStep->setAssertMessageFormat(kPictureRecoringAssertMessageFormat);
+    AssertCanvasStatesEqual(reporter, pictureCanvas, &referenceCanvas,
+        testStep);
+
+    SkBitmap playbackStore;
+    createBitmap(&playbackStore, SkBitmap::kARGB_8888_Config, 0xFFFFFFFF);
+    SkDevice playbackDevice(playbackStore);
+    SkCanvas playbackCanvas(&playbackDevice);
+    testPicture.draw(&playbackCanvas);
+    testStep->setAssertMessageFormat(kPicturePlaybackAssertMessageFormat);
+    AssertCanvasStatesEqual(reporter, &playbackCanvas, &referenceCanvas,
+        testStep);
+
+    // The following test code is commented out because SkPicture is not
+    // currently expected to preserve state when restarting recording.
+    /*
+    SkCanvas* pictureCanvas = testPicture.beginRecording(kWidth, kHeight);
+    testStep->setAssertMessageFormat(kPictureResumeAssertMessageFormat);
+    AssertCanvasStatesEqual(reporter, pictureCanvas, &referenceCanvas,
+        testStep);
+    */
+}
+
+static void TestDeferredCanvasStateConsistency(
+    skiatest::Reporter* reporter,
+    CanvasTestStep* testStep,
+    const SkCanvas& referenceCanvas) {
+
+    SkBitmap deferredStore;
+    createBitmap(&deferredStore, SkBitmap::kARGB_8888_Config, 0xFFFFFFFF);
+    SkDevice deferredDevice(deferredStore);
+    SkDeferredCanvas deferredCanvas(&deferredDevice);
+    testStep->setAssertMessageFormat(kDeferredDrawAssertMessageFormat);
+    testStep->draw(&deferredCanvas, reporter);
+    testStep->setAssertMessageFormat(kDeferredPreFlushAssertMessageFormat);
+    AssertCanvasStatesEqual(reporter, &deferredCanvas, &referenceCanvas,
+        testStep);
+
+    // Verify that deferred canvas state is not affected by flushing
+    // pending draw operations
+
+    // The following test code is commented out because it currently fails.
+    // Issue: http://code.google.com/p/skia/issues/detail?id=496
+    /*
+    deferredCanvas.flush();
+    testStep->setAssertMessageFormat(kDeferredPostFlushAssertMessageFormat);
+    AssertCanvasStatesEqual(reporter, &deferredCanvas, &referenceCanvas,
+        testStep);
+    */
+}
+
+static void TestProxyCanvasStateConsistency(
+    skiatest::Reporter* reporter,
+    CanvasTestStep* testStep,
+    const SkCanvas& referenceCanvas) {
+
+    SkBitmap indirectStore;
+    createBitmap(&indirectStore, SkBitmap::kARGB_8888_Config, 0xFFFFFFFF);
+    SkDevice indirectDevice(indirectStore);
+    SkCanvas indirectCanvas(&indirectDevice);
+    SkProxyCanvas proxyCanvas(&indirectCanvas);
+    testStep->setAssertMessageFormat(kProxyDrawAssertMessageFormat);
+    testStep->draw(&proxyCanvas, reporter);
+    // Verify that the SkProxyCanvas reports consistent state
+    testStep->setAssertMessageFormat(kProxyStateAssertMessageFormat);
+    AssertCanvasStatesEqual(reporter, &proxyCanvas, &referenceCanvas,
+        testStep);
+    // Verify that the indirect canvas reports consistent state
+    testStep->setAssertMessageFormat(kProxyIndirectStateAssertMessageFormat);
+    AssertCanvasStatesEqual(reporter, &indirectCanvas, &referenceCanvas,
+        testStep);
+}
+
+static void TestNWayCanvasStateConsistency(
+    skiatest::Reporter* reporter,
+    CanvasTestStep* testStep,
+    const SkCanvas& referenceCanvas) {
+
+    SkBitmap indirectStore1;
+    createBitmap(&indirectStore1, SkBitmap::kARGB_8888_Config, 0xFFFFFFFF);
+    SkDevice indirectDevice1(indirectStore1);
+    SkCanvas indirectCanvas1(&indirectDevice1);
+
+    SkBitmap indirectStore2;
+    createBitmap(&indirectStore2, SkBitmap::kARGB_8888_Config, 0xFFFFFFFF);
+    SkDevice indirectDevice2(indirectStore2);
+    SkCanvas indirectCanvas2(&indirectDevice2);
+
+    SkNWayCanvas nWayCanvas;
+    nWayCanvas.addCanvas(&indirectCanvas1);
+    nWayCanvas.addCanvas(&indirectCanvas2);
+
+    testStep->setAssertMessageFormat(kNWayDrawAssertMessageFormat);
+    testStep->draw(&nWayCanvas, reporter);
+    // Verify that the SkNWayCanvas reports consistent state
+    testStep->setAssertMessageFormat(kNWayStateAssertMessageFormat);
+    AssertCanvasStatesEqual(reporter, &nWayCanvas, &referenceCanvas,
+        testStep);
+    // Verify that the indirect canvases report consistent state
+    testStep->setAssertMessageFormat(kNWayIndirect1StateAssertMessageFormat);
+    AssertCanvasStatesEqual(reporter, &indirectCanvas1, &referenceCanvas,
+        testStep);
+    testStep->setAssertMessageFormat(kNWayIndirect2StateAssertMessageFormat);
+    AssertCanvasStatesEqual(reporter, &indirectCanvas2, &referenceCanvas,
+        testStep);
+}
+
+/*
+ * This sub-test verifies that the test step passes when executed
+ * with SkCanvas and with classes derived from SkCanvas. It also verifies
+ * that all canvas derivatives report the same state as an SkCanvas
+ * after having executed the test step.
+ */
+static void TestOverrideStateConsistency(skiatest::Reporter* reporter, 
+                                         CanvasTestStep* testStep) {
+    SkBitmap referenceStore;
+    createBitmap(&referenceStore, SkBitmap::kARGB_8888_Config, 0xFFFFFFFF);
+    SkDevice referenceDevice(referenceStore);
+    SkCanvas referenceCanvas(&referenceDevice);
+    testStep->setAssertMessageFormat(kCanvasDrawAssertMessageFormat);
+    testStep->draw(&referenceCanvas, reporter);
+
+    TestPictureStateConsistency(reporter, testStep, referenceCanvas);
+    TestDeferredCanvasStateConsistency(reporter, testStep, referenceCanvas);
+
+    // The following test code is commented out because SkProxyCanvas is
+    // missing a lot of virtual overrides on get* methods, which are used
+    // to verify canvas state.
+    // Issue: http://code.google.com/p/skia/issues/detail?id=500
+    
+    //TestProxyCanvasStateConsistency(reporter, testStep, referenceCanvas);
+
+    // The following test code is commented out because SkNWayCanvas does not
+    // report correct clipping and device bounds information
+    // Issue: http://code.google.com/p/skia/issues/detail?id=501
+    
+    //TestNWayCanvasStateConsistency(reporter, testStep, referenceCanvas);
 }
 
 static void TestCanvas(skiatest::Reporter* reporter) {
-    SkBitmap bm;
-    bm.setConfig(SkBitmap::kARGB_8888_Config, 256, 256);
-    bm.allocPixels();
+    // Init global here because bitmap pixels cannot be allocated during
+    // static initialization
+    kTestBitmap = testBitmap();
 
-    SkCanvas canvas(bm);
-    int n;
-
-    REPORTER_ASSERT(reporter, 1 == canvas.getSaveCount());
-    n = canvas.save();
-    REPORTER_ASSERT(reporter, 1 == n);
-    REPORTER_ASSERT(reporter, 2 == canvas.getSaveCount());
-    canvas.save();
-    canvas.save();
-    REPORTER_ASSERT(reporter, 4 == canvas.getSaveCount());
-    canvas.restoreToCount(2);
-    REPORTER_ASSERT(reporter, 2 == canvas.getSaveCount());
-
-    // should this pin to 1, or be a no-op, or crash?
-    canvas.restoreToCount(0);
-    REPORTER_ASSERT(reporter, 1 == canvas.getSaveCount());
-
-    test_isDrawingToLayer(reporter);
+    for (int testStep = 0; testStep < testStepArray().count(); testStep++) {
+        TestOverrideStateConsistency(reporter, testStepArray()[testStep]);
+        SkPictureTester::TestPictureSerializationRoundTrip(reporter, 
+            testStepArray()[testStep]);
+        SkPictureTester::TestPictureFlattenedObjectReuse(reporter,
+            testStepArray()[testStep]);
+    }
 }
 
 #include "TestClassDef.h"
diff --git a/tests/ClipCubicTest.cpp b/tests/ClipCubicTest.cpp
index 931b61e..491d0e5 100644
--- a/tests/ClipCubicTest.cpp
+++ b/tests/ClipCubicTest.cpp
@@ -7,9 +7,27 @@
  */
 #include "Test.h"
 
+#include "SkCanvas.h"
+#include "SkPaint.h"
 #include "SkCubicClipper.h"
 #include "SkGeometry.h"
 
+// Currently the supersampler blitter uses int16_t for its index into an array
+// the width of the clip. Test that we don't crash/assert if we try to draw
+// with a device/clip that is larger.
+static void test_giantClip() {
+    SkBitmap bm;
+    bm.setConfig(SkBitmap::kARGB_8888_Config, 64919, 1);
+    bm.allocPixels();
+    SkCanvas canvas(bm);
+    canvas.clear(0);
+    
+    SkPath path;
+    path.moveTo(0, 0); path.lineTo(1, 0); path.lineTo(33, 1);
+    SkPaint paint;
+    paint.setAntiAlias(true);
+    canvas.drawPath(path, paint);
+}
 
 static void PrintCurve(const char *name, const SkPoint crv[4]) {
     printf("%s: %.10g, %.10g, %.10g, %.10g, %.10g, %.10g, %.10g, %.10g\n",
@@ -142,6 +160,8 @@
         1.297736168, 7.059780121,
         2.505550385, 10,
         shouldbe), tol));
+
+    test_giantClip();
 }
 
 
diff --git a/tests/ColorTest.cpp b/tests/ColorTest.cpp
index 0efb892..83e2e3f 100644
--- a/tests/ColorTest.cpp
+++ b/tests/ColorTest.cpp
@@ -7,7 +7,9 @@
  */
 #include "Test.h"
 #include "SkColor.h"
+#include "SkColorPriv.h"
 #include "SkMath.h"
+#include "SkRandom.h"
 #include "SkUnPreMultiply.h"
 
 static void test_premul(skiatest::Reporter* reporter) {
@@ -31,9 +33,49 @@
     }
 }
 
+/**
+  This test fails: SkFourByteInterp does *not* preserve opaque destinations.
+  SkAlpha255To256 implemented as (alpha + 1) is faster than
+  (alpha + (alpha >> 7)), but inaccurate, and Skia intends to phase it out.
+*/
+/*
+static void test_interp(skiatest::Reporter* reporter) {
+    SkRandom r;
+
+    U8CPU a0 = 0;
+    U8CPU a255 = 255;
+    for (int i = 0; i < 200; i++) {
+        SkColor colorSrc = r.nextU();
+        SkColor colorDst = r.nextU();
+        SkPMColor src = SkPreMultiplyColor(colorSrc);
+        SkPMColor dst = SkPreMultiplyColor(colorDst);
+
+        REPORTER_ASSERT(reporter, SkFourByteInterp(src, dst, a0) == dst);
+        REPORTER_ASSERT(reporter, SkFourByteInterp(src, dst, a255) == src);
+    }
+}
+*/
+
+static void test_fast_interp(skiatest::Reporter* reporter) {
+    SkRandom r;
+
+    U8CPU a0 = 0;
+    U8CPU a255 = 255;
+    for (int i = 0; i < 200; i++) {
+        SkColor colorSrc = r.nextU();
+        SkColor colorDst = r.nextU();
+        SkPMColor src = SkPreMultiplyColor(colorSrc);
+        SkPMColor dst = SkPreMultiplyColor(colorDst);
+
+        REPORTER_ASSERT(reporter, SkFastFourByteInterp(src, dst, a0) == dst);
+        REPORTER_ASSERT(reporter, SkFastFourByteInterp(src, dst, a255) == src);
+    }
+}
 
 static void TestColor(skiatest::Reporter* reporter) {
     test_premul(reporter);
+    //test_interp(reporter);
+    test_fast_interp(reporter);
 }
 
 #include "TestClassDef.h"
diff --git a/tests/DeferredCanvasTest.cpp b/tests/DeferredCanvasTest.cpp
new file mode 100644
index 0000000..17adb52
--- /dev/null
+++ b/tests/DeferredCanvasTest.cpp
@@ -0,0 +1,187 @@
+
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+#include "Test.h"
+#include "SkBitmap.h"
+#include "SkDeferredCanvas.h"
+#include "SkShader.h"
+
+
+static const int gWidth = 2;
+static const int gHeight = 2;
+
+static void create(SkBitmap* bm, SkBitmap::Config config, SkColor color) {
+    bm->setConfig(config, gWidth, gHeight);
+    bm->allocPixels();
+    bm->eraseColor(color);
+}
+
+static void TestDeferredCanvasBitmapAccess(skiatest::Reporter* reporter) {
+    SkBitmap store;
+
+    create(&store, SkBitmap::kARGB_8888_Config, 0xFFFFFFFF);
+    SkDevice device(store);
+    SkDeferredCanvas canvas(&device);
+
+    canvas.clear(0x00000000);
+
+    SkAutoLockPixels alp(store);
+    REPORTER_ASSERT(reporter, store.getColor(0,0) == 0xFFFFFFFF); //verify that clear was deferred
+    SkBitmap accessed = canvas.getDevice()->accessBitmap(false);
+    REPORTER_ASSERT(reporter, store.getColor(0,0) == 0x00000000); //verify that clear was executed
+    REPORTER_ASSERT(reporter, accessed.pixelRef() == store.pixelRef());
+}
+
+static void TestDeferredCanvasFlush(skiatest::Reporter* reporter) {
+    SkBitmap store;
+
+    create(&store, SkBitmap::kARGB_8888_Config, 0xFFFFFFFF);
+    SkDevice device(store);
+    SkDeferredCanvas canvas(&device);
+
+    canvas.clear(0x00000000);
+
+    SkAutoLockPixels alp(store);
+    REPORTER_ASSERT(reporter, store.getColor(0,0) == 0xFFFFFFFF); //verify that clear was deferred
+    canvas.flush();
+    REPORTER_ASSERT(reporter, store.getColor(0,0) == 0x00000000); //verify that clear was executed
+}
+
+static void TestDeferredCanvasFreshFrame(skiatest::Reporter* reporter) {
+    SkBitmap store;
+    SkRect fullRect;
+    fullRect.setXYWH(SkIntToScalar(0), SkIntToScalar(0), SkIntToScalar(gWidth),
+        SkIntToScalar(gHeight));
+    SkRect partialRect;
+    partialRect.setXYWH(SkIntToScalar(0), SkIntToScalar(0),
+        SkIntToScalar(1), SkIntToScalar(1));
+    create(&store, SkBitmap::kARGB_8888_Config, 0xFFFFFFFF);
+    SkDevice device(store);
+    SkDeferredCanvas canvas(&device);
+
+    // verify that frame is initially fresh
+    REPORTER_ASSERT(reporter, canvas.getDeferredDevice()->isFreshFrame());
+    // no clearing op since last call to isFreshFrame -> not fresh
+    REPORTER_ASSERT(reporter, !canvas.getDeferredDevice()->isFreshFrame());
+
+    // Verify that clear triggers a fresh frame
+    canvas.clear(0x00000000);
+    REPORTER_ASSERT(reporter, canvas.getDeferredDevice()->isFreshFrame());
+
+    // Verify that clear with saved state triggers a fresh frame
+    canvas.save(SkCanvas::kMatrixClip_SaveFlag);
+    canvas.clear(0x00000000);
+    canvas.restore();
+    REPORTER_ASSERT(reporter, canvas.getDeferredDevice()->isFreshFrame());
+
+    // Verify that clear within a layer does NOT trigger a fresh frame
+    canvas.saveLayer(NULL, NULL, SkCanvas::kARGB_ClipLayer_SaveFlag);
+    canvas.clear(0x00000000);
+    canvas.restore();
+    REPORTER_ASSERT(reporter, !canvas.getDeferredDevice()->isFreshFrame());
+
+    // Verify that a clear with clipping triggers a fresh frame
+    // (clear is not affected by clipping)
+    canvas.save(SkCanvas::kMatrixClip_SaveFlag);
+    canvas.clipRect(partialRect, SkRegion::kIntersect_Op, false);
+    canvas.clear(0x00000000);
+    canvas.restore();
+    REPORTER_ASSERT(reporter, canvas.getDeferredDevice()->isFreshFrame());    
+
+    // Verify that full frame rects with different forms of opaque paint
+    // trigger frames to be marked as fresh
+    {
+        SkPaint paint;
+        paint.setStyle( SkPaint::kFill_Style );
+        paint.setAlpha( 255 );
+        canvas.drawRect(fullRect, paint);
+        REPORTER_ASSERT(reporter, canvas.getDeferredDevice()->isFreshFrame());
+    }
+    {
+        SkPaint paint;
+        paint.setStyle( SkPaint::kFill_Style );
+        SkBitmap bmp;
+        create(&bmp, SkBitmap::kARGB_8888_Config, 0xFFFFFFFF);
+        bmp.setIsOpaque(true);
+        SkShader* shader = SkShader::CreateBitmapShader(bmp, 
+            SkShader::kClamp_TileMode, SkShader::kClamp_TileMode);
+        paint.setShader(shader)->unref();
+        canvas.drawRect(fullRect, paint);
+        REPORTER_ASSERT(reporter, canvas.getDeferredDevice()->isFreshFrame());        
+    }
+
+    // Verify that full frame rects with different forms of non-opaque paint
+    // do not trigger frames to be marked as fresh
+    {
+        SkPaint paint;
+        paint.setStyle( SkPaint::kFill_Style );
+        paint.setAlpha( 254 );
+        canvas.drawRect(fullRect, paint);
+        REPORTER_ASSERT(reporter, !canvas.getDeferredDevice()->isFreshFrame());
+    }
+    {
+        SkPaint paint;
+        paint.setStyle( SkPaint::kFill_Style );
+        SkBitmap bmp;
+        create(&bmp, SkBitmap::kARGB_8888_Config, 0xFFFFFFFF);
+        bmp.setIsOpaque(false);
+        SkShader* shader = SkShader::CreateBitmapShader(bmp, 
+            SkShader::kClamp_TileMode, SkShader::kClamp_TileMode);
+        paint.setShader(shader)->unref();
+        canvas.drawRect(fullRect, paint);
+        REPORTER_ASSERT(reporter, !canvas.getDeferredDevice()->isFreshFrame());        
+    }
+
+    // Verify that incomplete coverage does not trigger a fresh frame
+    {
+        SkPaint paint;
+        paint.setStyle(SkPaint::kFill_Style);
+        paint.setAlpha(255);
+        canvas.drawRect(partialRect, paint);
+        REPORTER_ASSERT(reporter, !canvas.getDeferredDevice()->isFreshFrame());
+    }
+
+    // Verify that incomplete coverage due to clipping does not trigger a fresh
+    // frame
+    {
+        canvas.save(SkCanvas::kMatrixClip_SaveFlag);
+        canvas.clipRect(partialRect, SkRegion::kIntersect_Op, false);
+        SkPaint paint;
+        paint.setStyle(SkPaint::kFill_Style);
+        paint.setAlpha(255);
+        canvas.drawRect(fullRect, paint);
+        REPORTER_ASSERT(reporter, !canvas.getDeferredDevice()->isFreshFrame());
+    }
+
+    // Verify that stroked rect does not trigger a fresh frame
+    {
+        SkPaint paint;
+        paint.setStyle( SkPaint::kStroke_Style );
+        paint.setAlpha( 255 );
+        canvas.drawRect(fullRect, paint);
+        REPORTER_ASSERT(reporter, !canvas.getDeferredDevice()->isFreshFrame());
+    }
+    
+    // NOTE(review): asserts NOT fresh for a kSrc_Mode fill; the clip saved above is never restored - confirm intent
+    {
+        SkPaint paint;
+        paint.setStyle( SkPaint::kFill_Style );
+        paint.setAlpha( 100 );
+        paint.setXfermodeMode(SkXfermode::kSrc_Mode);
+        canvas.drawRect(fullRect, paint);
+        REPORTER_ASSERT(reporter, !canvas.getDeferredDevice()->isFreshFrame());
+    }
+}
+
+static void TestDeferredCanvas(skiatest::Reporter* reporter) {
+    TestDeferredCanvasBitmapAccess(reporter);
+    TestDeferredCanvasFlush(reporter);
+    TestDeferredCanvasFreshFrame(reporter);
+}
+
+#include "TestClassDef.h"
+DEFINE_TESTCLASS("DeferredCanvas", TestDeferredCanvasClass, TestDeferredCanvas)
diff --git a/tests/DrawPathTest.cpp b/tests/DrawPathTest.cpp
new file mode 100644
index 0000000..ae0068b
--- /dev/null
+++ b/tests/DrawPathTest.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "Test.h"
+#include "SkBitmap.h"
+#include "SkCanvas.h"
+
+static SkCanvas* create(SkBitmap::Config config, int w, int h, int rb,
+                        void* addr = NULL) {
+    SkBitmap bm;
+    bm.setConfig(config, w, h, rb);
+    if (addr) {
+        bm.setPixels(addr);
+    } else {
+        bm.allocPixels();
+    }
+    return new SkCanvas(bm);
+}
+
+// we used to assert if the bounds of the device (clip) was larger than 32K
+// even when the path itself was smaller. We just draw and hope in the debug
+// version to not assert.
+static void test_giantaa(skiatest::Reporter* reporter) {
+    const int W = 400;
+    const int H = 400;
+    SkCanvas* canvas = create(SkBitmap::kARGB_8888_Config, 33000, 10, 0, NULL);
+    canvas->clear(0);
+    
+    SkPaint paint;
+    paint.setAntiAlias(true);
+    SkPath path;
+    path.addOval(SkRect::MakeXYWH(-10, -10, 20 + W, 20 + H));
+    canvas->drawPath(path, paint);
+    canvas->unref();
+}
+
+static void TestDrawPath(skiatest::Reporter* reporter) {
+    test_giantaa(reporter);
+}
+
+#include "TestClassDef.h"
+DEFINE_TESTCLASS("DrawPath", TestDrawPathClass, TestDrawPath)
diff --git a/tests/DrawTextTest.cpp b/tests/DrawTextTest.cpp
new file mode 100644
index 0000000..aefe2f9
--- /dev/null
+++ b/tests/DrawTextTest.cpp
@@ -0,0 +1,115 @@
+/*

+ * Copyright 2011 Google Inc.

+ *

+ * Use of this source code is governed by a BSD-style license that can be

+ * found in the LICENSE file.

+ */

+#include "SkTypes.h"

+

+#include "Test.h"

+#include "SkBitmap.h"

+#include "SkCanvas.h"

+#include "SkColor.h"

+#include "SkPaint.h"

+#include "SkPoint.h"

+#include "SkRect.h"

+

+///////////////////////////////////////////////////////////////////////////////

+

+static const SkColor bgColor = SK_ColorWHITE;

+

+static void create(SkBitmap* bm, SkIRect bound, SkBitmap::Config config) {

+    bm->setConfig(config, bound.width(), bound.height());

+    bm->allocPixels();

+}

+

+static void drawBG(SkCanvas* canvas) {

+    canvas->drawColor(bgColor);

+}

+

+/** Assumes that the ref draw was completely inside ref canvas --

+    implies that everything outside is "bgColor".

+    Checks that all overlap is the same and that all non-overlap on the

+    ref is "bgColor".

+ */

+static bool compare(const SkBitmap& ref, const SkIRect& iref,

+                    const SkBitmap& test, const SkIRect& itest)

+{

+    const int xOff = itest.fLeft - iref.fLeft;

+    const int yOff = itest.fTop - iref.fTop;

+

+    SkAutoLockPixels alpRef(ref);

+    SkAutoLockPixels alpTest(test);

+

+    for (int y = 0; y < test.height(); ++y) {

+        for (int x = 0; x < test.width(); ++x) {

+            SkColor testColor = test.getColor(x, y);

+            int refX = x + xOff;

+            int refY = y + yOff;

+            SkColor refColor;

+            if (refX >= 0 && refX < ref.width() &&

+                refY >= 0 && refY < ref.height())

+            {

+                refColor = ref.getColor(refX, refY);

+            } else {

+                refColor = bgColor;

+            }

+            if (refColor != testColor) {

+                return false;

+            }

+        }

+    }

+    return true;

+}

+

+static void test_drawText(skiatest::Reporter* reporter) {

+

+    SkPaint paint;

+    paint.setColor(SK_ColorGRAY);

+    paint.setTextSize(SkIntToScalar(20));

+    

+    SkIRect drawTextRect = SkIRect::MakeWH(64, 64);

+    SkBitmap drawTextBitmap;

+    create(&drawTextBitmap, drawTextRect, SkBitmap::kARGB_8888_Config);

+    SkCanvas drawTextCanvas(drawTextBitmap);

+

+    SkIRect drawPosTextRect = SkIRect::MakeWH(64, 64);

+    SkBitmap drawPosTextBitmap;

+    create(&drawPosTextBitmap, drawPosTextRect, SkBitmap::kARGB_8888_Config);

+    SkCanvas drawPosTextCanvas(drawPosTextBitmap);

+

+    for (float offsetY = 0.0f; offsetY < 1.0f; offsetY += (1.0f / 16.0f)) {

+        for (float offsetX = 0.0f; offsetX < 1.0f; offsetX += (1.0f / 16.0f)) {

+            SkPoint point = SkPoint::Make(SkFloatToScalar(25.0f + offsetX),

+                                          SkFloatToScalar(25.0f + offsetY));

+

+            for (int align = 0; align < SkPaint::kAlignCount; ++align) {

+                paint.setTextAlign(static_cast<SkPaint::Align>(align));

+

+                for (unsigned int flags = 0; flags < (1 << 3); ++flags) {

+                    static const unsigned int antiAliasFlag = 1;

+                    static const unsigned int subpixelFlag = 1 << 1;

+                    static const unsigned int lcdFlag = 1 << 2;

+

+                    paint.setAntiAlias(SkToBool(flags & antiAliasFlag));

+                    paint.setSubpixelText(SkToBool(flags & subpixelFlag));

+                    paint.setLCDRenderText(SkToBool(flags & lcdFlag));

+

+                    // Test: drawText and drawPosText draw the same.

+                    drawBG(&drawTextCanvas);

+                    drawTextCanvas.drawText("A", 1, point.fX, point.fY, paint);

+

+                    drawBG(&drawPosTextCanvas);

+                    drawPosTextCanvas.drawPosText("A", 1, &point, paint);

+

+                    REPORTER_ASSERT(reporter,

+                        compare(drawTextBitmap, drawTextRect,

+                                drawPosTextBitmap, drawPosTextRect));

+                }

+            }

+        }

+    }

+}

+

+#include "TestClassDef.h"

+DEFINE_TESTCLASS("DrawText_DrawPosText", DrawTextTestClass, test_drawText)

diff --git a/tests/FontHostTest.cpp b/tests/FontHostTest.cpp
new file mode 100644
index 0000000..8ab7ad3
--- /dev/null
+++ b/tests/FontHostTest.cpp
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2012 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "Test.h"
+#include "SkTypeface.h"
+#include "SkFontHost.h"
+
+//#define DUMP_TABLES
+
+#define kFontTableTag_head          SkSetFourByteTag('h', 'e', 'a', 'd')
+#define kFontTableTag_hhea          SkSetFourByteTag('h', 'h', 'e', 'a')
+#define kFontTableTag_maxp          SkSetFourByteTag('m', 'a', 'x', 'p')
+
+static const struct TagSize {
+    SkFontTableTag  fTag;   // four-byte table tag (see kFontTableTag_* above)
+    size_t          fSize;  // size test_tables() requires GetTableSize to return
+} gKnownTableSizes[] = {
+    {   kFontTableTag_head,         54 },
+    {   kFontTableTag_hhea,         36 },
+    {   kFontTableTag_maxp,         32 },
+};
+
+static void test_tables(skiatest::Reporter* reporter, SkTypeface* face) {
+    SkFontID fontID = face->uniqueID();
+    // Cross-checks SkFontHost's table queries for this face against each other.
+    int count = SkFontHost::CountTables(fontID);
+    // CountTables and GetTableTags must agree on how many tables there are.
+    SkAutoTMalloc<SkFontTableTag> storage(count);
+    SkFontTableTag* tags = storage.get();
+
+    int count2 = SkFontHost::GetTableTags(fontID, tags);
+    REPORTER_ASSERT(reporter, count2 == count);
+
+    for (int i = 0; i < count; ++i) {
+        size_t size = SkFontHost::GetTableSize(fontID, tags[i]);
+        REPORTER_ASSERT(reporter, size > 0);
+        // Optional dump; size is a size_t, so cast it to match the %d below.
+#ifdef DUMP_TABLES
+        char name[5];
+        name[0] = (tags[i] >> 24) & 0xFF;
+        name[1] = (tags[i] >> 16) & 0xFF;
+        name[2] = (tags[i] >>  8) & 0xFF;
+        name[3] = (tags[i] >>  0) & 0xFF;
+        name[4] = 0;
+        SkDebugf("%s %d\n", name, static_cast<int>(size));
+#endif
+        // Tables listed in gKnownTableSizes must report exactly that size.
+        for (size_t j = 0; j < SK_ARRAY_COUNT(gKnownTableSizes); ++j) {
+            if (gKnownTableSizes[j].fTag == tags[i]) {
+                REPORTER_ASSERT(reporter, gKnownTableSizes[j].fSize == size);
+            }
+        }
+
+        // do we get the same size from GetTableData and GetTableSize
+        {
+            SkAutoMalloc data(size);
+            size_t size2 = SkFontHost::GetTableData(fontID, tags[i], 0, size,
+                                                    data.get());
+            REPORTER_ASSERT(reporter, size2 == size);
+        }
+    }
+}
+
+static void test_tables(skiatest::Reporter* reporter) {
+    static const char* const gNames[] = {
+        NULL,   // default font
+        "Arial", "Times", "Times New Roman", "Helvetica", "Courier",
+        "Courier New",
+    };
+    // Names for which CreateFromName returns NULL are skipped, not failed.
+    for (size_t i = 0; i < SK_ARRAY_COUNT(gNames); ++i) {
+        SkTypeface* face = SkTypeface::CreateFromName(gNames[i],
+                                                      SkTypeface::kNormal);
+        if (face) {
+#ifdef DUMP_TABLES
+            SkDebugf("%s\n", gNames[i] ? gNames[i] : "(default)");  // [0] is NULL
+#endif
+            test_tables(reporter, face);
+            face->unref();
+        }
+    }
+}
+
+static void TestFontHost(skiatest::Reporter* reporter) {  // test entry point
+    test_tables(reporter);
+}
+
+// need tests for SkStrSearch
+
+#include "TestClassDef.h"
+DEFINE_TESTCLASS("FontHost", FontHostTestClass, TestFontHost)
diff --git a/tests/GLInterfaceValidation.cpp b/tests/GLInterfaceValidation.cpp
index 2be13f0..5cee0e4 100755
--- a/tests/GLInterfaceValidation.cpp
+++ b/tests/GLInterfaceValidation.cpp
@@ -7,8 +7,8 @@
  */
 
 #include "Test.h"
-#include "SkNativeGLContext.h"
-#include "SkMesaGLContext.h"
+#include "gl/SkNativeGLContext.h"
+#include "gl/SkMesaGLContext.h"
 
 static void GLInterfaceValidationTest(skiatest::Reporter* reporter) {
     typedef const GrGLInterface* (*interfaceFactory)();
@@ -51,7 +51,13 @@
         iface.reset(interfaceFactories[i].fFactory());
         REPORTER_ASSERT(reporter, NULL != iface.get());
         if (iface.get()) {
-            REPORTER_ASSERT(reporter, iface.get()->validate());
+            for (GrGLBinding binding = kFirstGrGLBinding;
+                 binding <= kLastGrGLBinding;
+                 binding = static_cast<GrGLBinding>(binding << 1)) {
+                if (iface.get()->fBindingsExported & binding) {
+                    REPORTER_ASSERT(reporter, iface.get()->validate(binding));
+                }
+            }
         }
     }
 }
diff --git a/tests/GLProgramsTest.cpp b/tests/GLProgramsTest.cpp
index 5cacade..583b802 100644
--- a/tests/GLProgramsTest.cpp
+++ b/tests/GLProgramsTest.cpp
@@ -8,7 +8,7 @@
 
 #include "Test.h"
 #include "GrContext.h"
-#include "GrGpuGLShaders.h"
+#include "gl/GrGpuGLShaders.h"
 
 static void GLProgramsTest(skiatest::Reporter* reporter, GrContext* context) {
     GrGpuGLShaders* shadersGpu = (GrGpuGLShaders*) context->getGpu();
diff --git a/tests/GeometryTest.cpp b/tests/GeometryTest.cpp
index 6158a20..9a0f78f 100644
--- a/tests/GeometryTest.cpp
+++ b/tests/GeometryTest.cpp
@@ -12,6 +12,25 @@
     return SkScalarNearlyEqual(a.fX, b.fX) && SkScalarNearlyEqual(a.fY, b.fY);
 }
 
+static void testChopCubic(skiatest::Reporter* reporter) {
+    /*
+        Regression test for a cubic whose chop used to trip an internal assert
+        that the tValues had dups. The control points below come from:
+        <path stroke="#202020" d="M0,0 C0,0 1,1 2190,5130 C2190,5070 2220,5010 2205,4980" />
+     */
+    const SkPoint src[] = {
+        { SkIntToScalar(2190), SkIntToScalar(5130) },
+        { SkIntToScalar(2190), SkIntToScalar(5070) },
+        { SkIntToScalar(2220), SkIntToScalar(5010) },
+        { SkIntToScalar(2205), SkIntToScalar(4980) },
+    };
+    SkPoint dst[13];
+    SkScalar tValues[3];
+    // Only "does not assert internally" is tested, so the result is discarded.
+    (void)SkChopCubicAtMaxCurvature(src, dst, tValues);
+}
+
+
 static void TestGeometry(skiatest::Reporter* reporter) {
     SkPoint pts[3], dst[5];
 
@@ -35,6 +54,8 @@
     for (int i = 0; i < 4; ++i) {
         REPORTER_ASSERT(reporter, nearly_equal(cubic[i], dst[i]));
     }
+    
+    testChopCubic(reporter);
 }
 
 #include "TestClassDef.h"
diff --git a/tests/PathMeasureTest.cpp b/tests/PathMeasureTest.cpp
index d454e37..2ff9f3a 100644
--- a/tests/PathMeasureTest.cpp
+++ b/tests/PathMeasureTest.cpp
@@ -43,6 +43,91 @@
                  d, p.fX, p.fY, v.fX, v.fY);
 #endif
     }
+
+    // Test the behavior following a close not followed by a move.
+    path.reset();
+    path.lineTo(SK_Scalar1, 0);
+    path.lineTo(SK_Scalar1, SK_Scalar1);
+    path.lineTo(0, SK_Scalar1);
+    path.close();
+    path.lineTo(-SK_Scalar1, 0);
+    meas.setPath(&path, false);
+    length = meas.getLength();
+    REPORTER_ASSERT(reporter, length == SK_Scalar1 * 4);
+    meas.nextContour();
+    length = meas.getLength();
+    REPORTER_ASSERT(reporter, length == SK_Scalar1);
+    SkPoint position;
+    SkVector tangent;
+    REPORTER_ASSERT(reporter, meas.getPosTan(SK_ScalarHalf, &position, &tangent));
+    REPORTER_ASSERT(reporter,
+        SkScalarNearlyEqual(position.fX, -SK_ScalarHalf, SK_Scalar1 * 0.0001));
+    REPORTER_ASSERT(reporter, position.fY == 0);
+    REPORTER_ASSERT(reporter, tangent.fX == -SK_Scalar1);
+    REPORTER_ASSERT(reporter, tangent.fY == 0);
+
+    // Test degenerate paths
+    path.reset();
+    path.moveTo(0, 0);
+    path.lineTo(0, 0);
+    path.lineTo(SK_Scalar1, 0);
+    path.quadTo(SK_Scalar1, 0, SK_Scalar1, 0);
+    path.quadTo(SK_Scalar1, SK_Scalar1, SK_Scalar1, SK_Scalar1 * 2);
+    path.cubicTo(SK_Scalar1, SK_Scalar1 * 2,
+                 SK_Scalar1, SK_Scalar1 * 2,
+                 SK_Scalar1, SK_Scalar1 * 2);
+    path.cubicTo(SK_Scalar1*2, SK_Scalar1 * 2,
+                 SK_Scalar1*3, SK_Scalar1 * 2,
+                 SK_Scalar1*4, SK_Scalar1 * 2);
+    meas.setPath(&path, false);
+    length = meas.getLength();
+    REPORTER_ASSERT(reporter, length == SK_Scalar1 * 6);
+    REPORTER_ASSERT(reporter, meas.getPosTan(SK_ScalarHalf, &position, &tangent));
+    REPORTER_ASSERT(reporter,
+        SkScalarNearlyEqual(position.fX, SK_ScalarHalf, SK_Scalar1 * 0.0001));
+    REPORTER_ASSERT(reporter, position.fY == 0);
+    REPORTER_ASSERT(reporter, tangent.fX == SK_Scalar1);
+    REPORTER_ASSERT(reporter, tangent.fY == 0);
+    REPORTER_ASSERT(reporter, meas.getPosTan(SK_Scalar1 * 2.5f, &position, &tangent));
+    REPORTER_ASSERT(reporter,
+        SkScalarNearlyEqual(position.fX, SK_Scalar1, SK_Scalar1 * 0.0001));
+    REPORTER_ASSERT(reporter,
+        SkScalarNearlyEqual(position.fY, SK_Scalar1 * 1.5f));
+    REPORTER_ASSERT(reporter, tangent.fX == 0);
+    REPORTER_ASSERT(reporter, tangent.fY == SK_Scalar1);
+    REPORTER_ASSERT(reporter, meas.getPosTan(SK_Scalar1 * 4.5f, &position, &tangent));
+    REPORTER_ASSERT(reporter,
+        SkScalarNearlyEqual(position.fX, SK_Scalar1 * 2.5f, SK_Scalar1 * 0.0001));
+    REPORTER_ASSERT(reporter,
+        SkScalarNearlyEqual(position.fY, SK_Scalar1 * 2.0f, SK_Scalar1 * 0.0001));
+    REPORTER_ASSERT(reporter, tangent.fX == SK_Scalar1);
+    REPORTER_ASSERT(reporter, tangent.fY == 0);
+
+    path.reset();
+    path.moveTo(0, 0);
+    path.lineTo(SK_Scalar1, 0);
+    path.moveTo(SK_Scalar1, SK_Scalar1);
+    path.moveTo(SK_Scalar1 * 2, SK_Scalar1 * 2);
+    path.lineTo(SK_Scalar1, SK_Scalar1 * 2);
+    meas.setPath(&path, false);
+    length = meas.getLength();
+    REPORTER_ASSERT(reporter, length == SK_Scalar1);
+    REPORTER_ASSERT(reporter, meas.getPosTan(SK_ScalarHalf, &position, &tangent));
+    REPORTER_ASSERT(reporter,
+        SkScalarNearlyEqual(position.fX, SK_ScalarHalf, SK_Scalar1 * 0.0001));
+    REPORTER_ASSERT(reporter, position.fY == 0);
+    REPORTER_ASSERT(reporter, tangent.fX == SK_Scalar1);
+    REPORTER_ASSERT(reporter, tangent.fY == 0);
+    meas.nextContour();
+    length = meas.getLength();
+    REPORTER_ASSERT(reporter, length == SK_Scalar1);
+    REPORTER_ASSERT(reporter, meas.getPosTan(SK_ScalarHalf, &position, &tangent));
+    REPORTER_ASSERT(reporter,
+        SkScalarNearlyEqual(position.fX, SK_Scalar1 * 1.5f, SK_Scalar1 * 0.0001));
+    REPORTER_ASSERT(reporter,
+        SkScalarNearlyEqual(position.fY, SK_Scalar1 * 2.0f, SK_Scalar1 * 0.0001));
+    REPORTER_ASSERT(reporter, tangent.fX == -SK_Scalar1);
+    REPORTER_ASSERT(reporter, tangent.fY == 0);
 }
 
 #include "TestClassDef.h"
diff --git a/tests/PathTest.cpp b/tests/PathTest.cpp
index fadb0d9..868ce31 100644
--- a/tests/PathTest.cpp
+++ b/tests/PathTest.cpp
@@ -15,6 +15,21 @@
 #include "SkSize.h"
 #include "SkWriter32.h"
 
+/**
+ * cheapIsDirection can take a shortcut when a path is marked convex. A convex
+ * path is therefore checked twice: once as-is, and once more after its
+ * convexity has been reset to unknown so the general code path runs too.
+ */
+static void check_direction(SkPath* path,
+                            SkPath::Direction expectedDir,
+                            skiatest::Reporter* reporter) {
+    if (path->getConvexity() == SkPath::kConvex_Convexity) {
+        REPORTER_ASSERT(reporter, path->cheapIsDirection(expectedDir));
+        path->setConvexity(SkPath::kUnknown_Convexity);
+    }
+    REPORTER_ASSERT(reporter, path->cheapIsDirection(expectedDir));
+}
+
 static void test_direction(skiatest::Reporter* reporter) {
     size_t i;
     SkPath path;
@@ -40,24 +55,48 @@
     static const char* gCW[] = {
         "M 10 10 L 10 10 Q 20 10 20 20",
         "M 10 10 C 20 10 20 20 20 20",
+        "M 20 10 Q 20 20 30 20 L 10 20", // test double-back at y-max
     };
     for (i = 0; i < SK_ARRAY_COUNT(gCW); ++i) {
         path.reset();
         bool valid = SkParsePath::FromSVGString(gCW[i], &path);
         REPORTER_ASSERT(reporter, valid);
-        REPORTER_ASSERT(reporter, path.cheapIsDirection(SkPath::kCW_Direction));
+        check_direction(&path, SkPath::kCW_Direction, reporter);
     }
     
     static const char* gCCW[] = {
         "M 10 10 L 10 10 Q 20 10 20 -20",
         "M 10 10 C 20 10 20 -20 20 -20",
+        "M 20 10 Q 20 20 10 20 L 30 20", // test double-back at y-max
     };
     for (i = 0; i < SK_ARRAY_COUNT(gCCW); ++i) {
         path.reset();
         bool valid = SkParsePath::FromSVGString(gCCW[i], &path);
         REPORTER_ASSERT(reporter, valid);
-        REPORTER_ASSERT(reporter, path.cheapIsDirection(SkPath::kCCW_Direction));
+        check_direction(&path, SkPath::kCCW_Direction, reporter);
     }
+
+    // Test two donuts, each wound a different direction. Only the outer contour
+    // determines the cheap direction
+    path.reset();
+    path.addCircle(0, 0, SkIntToScalar(2), SkPath::kCW_Direction);
+    path.addCircle(0, 0, SkIntToScalar(1), SkPath::kCCW_Direction);
+    check_direction(&path, SkPath::kCW_Direction, reporter);
+
+    path.reset();
+    path.addCircle(0, 0, SkIntToScalar(1), SkPath::kCW_Direction);
+    path.addCircle(0, 0, SkIntToScalar(2), SkPath::kCCW_Direction);
+    check_direction(&path, SkPath::kCCW_Direction, reporter);
+
+#ifdef SK_SCALAR_IS_FLOAT
+    // triangle with one point really far from the origin.
+    path.reset();
+    // the first point is roughly 1.05e10, 1.05e10
+    path.moveTo(SkFloatToScalar(SkBits2Float(0x501c7652)), SkFloatToScalar(SkBits2Float(0x501c7652)));
+    path.lineTo(110 * SK_Scalar1, -10 * SK_Scalar1);
+    path.lineTo(-10 * SK_Scalar1, 60 * SK_Scalar1);
+    check_direction(&path, SkPath::kCCW_Direction, reporter);
+#endif
 }
 
 static void add_rect(SkPath* path, const SkRect& r) {
@@ -889,12 +928,14 @@
     // Max of 10 segments, max 3 points per segment
     SkRandom rand(9876543);
     SkPoint          expectedPts[31]; // May have leading moveTo
-    SkPath::Verb     expectedVerbs[11]; // May have leading moveTo
+    SkPath::Verb     expectedVerbs[22]; // May have leading moveTo
     SkPath::Verb     nextVerb;
+
     for (int i = 0; i < 500; ++i) {
         p.reset();
         bool lastWasClose = true;
         bool haveMoveTo = false;
+        SkPoint lastMoveToPt = { 0, 0 };
         int numPoints = 0;
         int numVerbs = (rand.nextU() >> 16) % 10;
         int numIterVerbs = 0;
@@ -907,13 +948,14 @@
                 case SkPath::kMove_Verb:
                     expectedPts[numPoints] = randomPts[(rand.nextU() >> 16) % 25];
                     p.moveTo(expectedPts[numPoints]);
+                    lastMoveToPt = expectedPts[numPoints];
                     numPoints += 1;
                     lastWasClose = false;
                     haveMoveTo = true;
                     break;
                 case SkPath::kLine_Verb:
                     if (!haveMoveTo) {
-                        expectedPts[numPoints++].set(0, 0);
+                        expectedPts[numPoints++] = lastMoveToPt;
                         expectedVerbs[numIterVerbs++] = SkPath::kMove_Verb;
                         haveMoveTo = true;
                     }
@@ -924,7 +966,7 @@
                     break;
                 case SkPath::kQuad_Verb:
                     if (!haveMoveTo) {
-                        expectedPts[numPoints++].set(0, 0);
+                        expectedPts[numPoints++] = lastMoveToPt;
                         expectedVerbs[numIterVerbs++] = SkPath::kMove_Verb;
                         haveMoveTo = true;
                     }
@@ -936,7 +978,7 @@
                     break;
                 case SkPath::kCubic_Verb:
                     if (!haveMoveTo) {
-                        expectedPts[numPoints++].set(0, 0);
+                        expectedPts[numPoints++] = lastMoveToPt;
                         expectedVerbs[numIterVerbs++] = SkPath::kMove_Verb;
                         haveMoveTo = true;
                     }
@@ -950,6 +992,7 @@
                     break;
                 case SkPath::kClose_Verb:
                     p.close();
+                    haveMoveTo = false;
                     lastWasClose = true;
                     break;
                 default:;
diff --git a/tests/PremulAlphaRoundTripTest.cpp b/tests/PremulAlphaRoundTripTest.cpp
new file mode 100644
index 0000000..c4ec6ab
--- /dev/null
+++ b/tests/PremulAlphaRoundTripTest.cpp
@@ -0,0 +1,106 @@
+
+/*
+ * Copyright 2011 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "Test.h"
+#include "SkCanvas.h"
+#include "SkConfig8888.h"
+#include "SkGpuDevice.h"
+
+
+namespace {
+
+void fillCanvas(SkCanvas* canvas, SkCanvas::Config8888 unpremulConfig) {
+    SkBitmap src;
+    src.setConfig(SkBitmap::kARGB_8888_Config, 256, 256);
+    src.allocPixels();
+    SkAutoLockPixels lock(src);
+    uint32_t* next = reinterpret_cast<uint32_t*>(src.getPixels());
+    // One pixel per (alpha, red) pair: rows step alpha, columns step red.
+    for (int alpha = 0; alpha < 256; ++alpha) {
+        for (int red = 0; red < 256; ++red) {
+            *next++ = SkPackConfig8888(unpremulConfig, alpha, red, 0, 0);
+        }
+    }
+    canvas->writePixels(src, 0, 0, unpremulConfig);
+}
+
+static const SkCanvas::Config8888 gUnpremulConfigs[] = {
+    SkCanvas::kNative_Unpremul_Config8888,
+/**
+ * A bug in Ganesh (http://code.google.com/p/skia/issues/detail?id=438) makes
+ * reading pixels back from a BGRA canvas into an RGBA bitmap fail, so the BGRA
+ * config is left out on Android. Remove this #if once that issue is resolved.
+ */
+#if !defined(SK_BUILD_FOR_ANDROID)
+    SkCanvas::kBGRA_Unpremul_Config8888,
+#endif
+    SkCanvas::kRGBA_Unpremul_Config8888,
+};
+
+void PremulAlphaRoundTripTest(skiatest::Reporter* reporter,
+                              GrContext* context) {
+    SkCanvas canvas;
+    for (int dtype = 0; dtype < 2; ++dtype) {  // 0: SkDevice, 1: SkGpuDevice
+        if (0 == dtype) {
+            canvas.setDevice(new SkDevice(SkBitmap::kARGB_8888_Config,
+                                          256,
+                                          256,
+                                          false))->unref();
+        } else {
+#ifdef SK_SCALAR_IS_FIXED
+            // GPU device known not to work in the fixed pt build.
+            continue;
+#endif
+            canvas.setDevice(new SkGpuDevice(context,
+                                             SkBitmap::kARGB_8888_Config,
+                                             256,
+                                             256))->unref();
+        }
+        // readBmp1 receives the first readback, readBmp2 the second one.
+        SkBitmap readBmp1;
+        readBmp1.setConfig(SkBitmap::kARGB_8888_Config, 256, 256);
+        readBmp1.allocPixels();
+        SkBitmap readBmp2;
+        readBmp2.setConfig(SkBitmap::kARGB_8888_Config, 256, 256);
+        readBmp2.allocPixels();
+
+        for (size_t upmaIdx = 0;
+             upmaIdx < SK_ARRAY_COUNT(gUnpremulConfigs);
+             ++upmaIdx) {
+            fillCanvas(&canvas, gUnpremulConfigs[upmaIdx]);
+            {
+                SkAutoLockPixels alp1(readBmp1);
+                SkAutoLockPixels alp2(readBmp2);
+                sk_bzero(readBmp1.getPixels(), readBmp1.getSafeSize());
+                sk_bzero(readBmp2.getPixels(), readBmp2.getSafeSize());
+            }
+            // Round trip: read, write the same pixels back, then read again.
+            canvas.readPixels(&readBmp1, 0, 0, gUnpremulConfigs[upmaIdx]);
+            canvas.writePixels(readBmp1, 0, 0, gUnpremulConfigs[upmaIdx]);
+            canvas.readPixels(&readBmp2, 0, 0, gUnpremulConfigs[upmaIdx]);
+            // Both readbacks must match pixel for pixel.
+            SkAutoLockPixels alp1(readBmp1);
+            SkAutoLockPixels alp2(readBmp2);
+            uint32_t* pixels1 =
+                reinterpret_cast<uint32_t*>(readBmp1.getPixels());
+            uint32_t* pixels2 =
+                reinterpret_cast<uint32_t*>(readBmp2.getPixels());
+            for (int y = 0; y < 256; ++y) {
+                for (int x = 0; x < 256; ++x) {
+                    int i = y * 256 + x;
+                    REPORTER_ASSERT(reporter, pixels1[i] == pixels2[i]);
+                }
+            }
+        }
+    }
+}
+}
+
+#include "TestClassDef.h"
+DEFINE_GPUTESTCLASS("PremulAlphaRoundTripTest", PremulAlphaRoundTripTestClass, PremulAlphaRoundTripTest)
+
diff --git a/tests/ReadPixelsTest.cpp b/tests/ReadPixelsTest.cpp
index b531e92..4e0fcc6 100644
--- a/tests/ReadPixelsTest.cpp
+++ b/tests/ReadPixelsTest.cpp
@@ -297,7 +297,7 @@
         SkIRect::MakeLTRB(3 * DEV_W / 4, -10, DEV_W + 10, DEV_H + 10),
     };
 
-    for (int dtype = 1; dtype < 2; ++dtype) {
+    for (int dtype = 0; dtype < 2; ++dtype) {
 
         if (0 == dtype) {
             canvas.setDevice(new SkDevice(SkBitmap::kARGB_8888_Config,
diff --git a/tests/Test.cpp b/tests/Test.cpp
index 1c3b691..62df731 100644
--- a/tests/Test.cpp
+++ b/tests/Test.cpp
@@ -8,7 +8,7 @@
 #include "Test.h"
 
 #include "GrContext.h"
-#include "SkNativeGLContext.h"
+#include "gl/SkNativeGLContext.h"
 #include "SkTLazy.h"
 
 using namespace skiatest;
diff --git a/tests/Test.h b/tests/Test.h
index 8728040..4ca1971 100644
--- a/tests/Test.h
+++ b/tests/Test.h
@@ -115,10 +115,19 @@
     do {                                                                \
         if (!(cond)) {                                                  \
             SkString desc;                                              \
-            desc.printf("%s:%d: %s", __FILE__, __LINE__, #cond);      \
+            desc.printf("%s:%d: %s", __FILE__, __LINE__, #cond);        \
             r->reportFailed(desc);                                      \
         }                                                               \
     } while(0)
 
+// Reports a failure when cond is false; 'message' is put first in the report.
+#define REPORTER_ASSERT_MESSAGE(r, cond, message)                            \
+    do {                                                                     \
+        if (!(cond)) {                                                       \
+            SkString desc;                                                   \
+            desc.printf("%s %s:%d: %s", message, __FILE__, __LINE__, #cond); \
+            r->reportFailed(desc);                                           \
+        }                                                                    \
+    } while(0)
 
 #endif
diff --git a/tests/WArrayTest.cpp b/tests/WArrayTest.cpp
index 428ca5f..daab543 100644
--- a/tests/WArrayTest.cpp
+++ b/tests/WArrayTest.cpp
@@ -108,7 +108,7 @@
     bool leadingSpace = false;
     while (data != NULL) {
       if (leadingSpace) {
-        result.appendf(" ");
+        result.append(" ");
       } else {
         leadingSpace = true;
       }
@@ -121,11 +121,11 @@
           result.appendf("%d[", data->fStartId);
           for (int i = 0; i < data->fAdvance.count(); ++i) {
             if (i > 0) {
-              result.appendf(" ");
+              result.append(" ");
             }
             result.appendf("%d", data->fAdvance[i]);
           }
-          result.appendf("]");
+          result.append("]");
           break;
         case SkAdvancedTypefaceMetrics::AdvanceMetric<int16_t>::kDefault:
           result.appendf("<Default=%d>", data->fAdvance[0]);
diff --git a/tests/WritePixelsTest.cpp b/tests/WritePixelsTest.cpp
index 0c5b7b9..403ab84 100644
--- a/tests/WritePixelsTest.cpp
+++ b/tests/WritePixelsTest.cpp
@@ -233,6 +233,7 @@
     intptr_t canvasPixels = reinterpret_cast<intptr_t>(devBmp.getPixels());
     size_t canvasRowBytes = devBmp.rowBytes();
     SkIRect writeRect = SkIRect::MakeXYWH(writeX, writeY, bitmap.width(), bitmap.height());
+    bool success = true;
     for (int cy = 0; cy < DEV_H; ++cy) {
         const SkPMColor* canvasRow = reinterpret_cast<const SkPMColor*>(canvasPixels);
         for (int cx = 0; cx < DEV_W; ++cx) {
@@ -246,14 +247,14 @@
                 bool check;
                 REPORTER_ASSERT(reporter, check = checkPixel(bmpPMColor, canvasPixel, mul));
                 if (!check) {
-                    return false;
+                    success = false;
                 }
             } else {
                 bool check;
                 SkPMColor testColor = getCanvasColor(cx, cy);
                 REPORTER_ASSERT(reporter, check = (canvasPixel == testColor));
                 if (!check) {
-                    return false;
+                    success = false;
                 }
             }
         }
@@ -263,14 +264,14 @@
                 bool check;
                 REPORTER_ASSERT(reporter, check = (pad[px] == static_cast<char>(DEV_PAD)));
                 if (!check) {
-                    return false;
+                    success = false;
                 }
             }
         }
         canvasPixels += canvasRowBytes;
     }
 
-    return true;
+    return success;
 }
 
 enum DevType {
diff --git a/third_party/glu/gluos.h b/third_party/glu/gluos.h
index 5da46a5..e94c679 100644
--- a/third_party/glu/gluos.h
+++ b/third_party/glu/gluos.h
@@ -71,7 +71,9 @@
 #define __gl_edgeSign Sk__gl_edgeSign
 #define __gl_memInit Sk__gl_memInit
 #define __gl_meshAddEdgeVertex Sk__gl_meshAddEdgeVertex
+#ifndef NDEBUG
 #define __gl_meshCheckMesh Sk__gl_meshCheckMesh
+#endif
 #define __gl_meshConnect Sk__gl_meshConnect
 #define __gl_meshDelete Sk__gl_meshDelete
 #define __gl_meshDeleteMesh Sk__gl_meshDeleteMesh